PTAR-OCR-Evaluation-and-Correction

In [1]:
%load_ext autoreload
In [2]:
%autoreload 2
In [3]:
from text2topics import reports
from text2topics import utilities
from text2topics import clean
import re
import os
from os import listdir
from os.path import isfile, join
import collections
In [4]:
%matplotlib inline
In [5]:
wordlist_dir = "/Users/jeriwieringa/Dissertation/drafts/data/word-lists"
wordlists = ["2016-12-07-SDA-last-names.txt", 
             "2016-12-07-SDA-place-names.txt", 
             "2016-12-08-SDA-Vocabulary.txt", 
             "2017-01-03-place-names.txt", 
             "2017-02-14-Base-Word-List-SCOWL&KJV.txt",
             "2017-02-14-Roman-Numerals.txt",
             "2017-03-01-Additional-Approved-Words.txt"
            ]
In [6]:
spelling_dictionary = utilities.create_spelling_dictionary(wordlist_dir, wordlists)
In [7]:
title = "PTAR"
In [8]:
base_dir = "/Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/{}/".format(title)

Baseline

In [9]:
cycle = 'baseline'
In [10]:
stats = reports.overview_report(join(base_dir, cycle), spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/PTAR/baseline

Average verified rate: 0.9258265879793642

Average of error rates: 0.0755695652173913

Total token count: 228923

In [11]:
errors_summary = reports.get_errors_summary( stats )
reports.top_errors( errors_summary, 30 )
Out[11]:
[('-', 612),
 ("'", 515),
 ('ñ', 232),
 (')', 230),
 ('th', 161),
 ('ch', 128),
 ('re-', 124),
 ('be-', 121),
 (']', 114),
 ('d', 113),
 ('¥', 110),
 ('com-', 97),
 ('con-', 97),
 ('ment', 92),
 ('tion', 84),
 ('n', 72),
 ('ver', 71),
 ('ly', 65),
 ('in-', 65),
 ('ex', 64),
 ('e', 62),
 ('x', 60),
 ('un-', 58),
 ('t', 57),
 ('*', 57),
 ('sab-', 56),
 ('de-', 56),
 ('ex-', 55),
 ('m', 54),
 ("'the", 54),
 ('w', 51),
 ('an-', 45),
 ("the'", 44),
 ('pro-', 44),
 ('ments', 43),
 ('ad-', 40),
 ('_', 39),
 ('the-', 39),
 ('r', 35),
 ('ñthe', 35),
 ('command-', 35),
 ('dis-', 35),
 ('pre-', 34),
 ('mandments', 34),
 ('-the', 32)]

Check Special Character Use

In [12]:
reports.tokens_with_special_characters(errors_summary)[:50]
Out[12]:
[('ñ', 232),
 (')', 230),
 (']', 114),
 ('¥', 110),
 ('*', 57),
 ('_', 39),
 ('ñthe', 35),
 ('(the', 26),
 ('(see', 24),
 ('[the', 21),
 ('(', 19),
 ('saysñ', 18),
 ('(or', 17),
 ('[', 14),
 ('(ps', 11),
 ('ñsee', 11),
 ('(margin', 10),
 ('ô', 10),
 ('ñthat', 10),
 ('(which', 10),
 ('[margin', 10),
 ('ñps', 8),
 ('/', 8),
 ('[rev', 8),
 ('[or', 8),
 ('(i', 8),
 ('(for', 7),
 ('[letter', 7),
 ('=', 7),
 ('(though', 7),
 ('[no', 7),
 ('(rev', 6),
 ('[see', 6),
 ('ñand', 6),
 ("ñ'", 6),
 ('ñto', 6),
 ('holies]', 6),
 ('cryñ', 5),
 ('(and', 5),
 ('saidñ', 5),
 ('(as', 5),
 ('ñwe', 5),
 ('[in', 5),
 ('(in', 5),
 ('(heb', 5),
 ('truthñthe', 4),
 ('worldña', 4),
 ('(to', 4),
 ('willñthat', 4),
 ('it)', 4)]

Correction 1 -- Normalize Characters

In [13]:
# %load shared_elements/normalize_characters.py
prev = "baseline"
cycle = "correction1"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    # Substitute for all other dashes
    content = re.sub(r"—-—–‑", r"-", content)

    # Substitute formatted apostrophe
    content = re.sub(r"\’\’\‘\'\‛\´", r"'", content)
    
    # Replace all special characters with a space (as these tend to occur at the end of lines)
    content = re.sub(r"[^a-zA-Z0-9\s,.!?$:;\-&\'\"]", r" ", content)
    
    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()

Check Correction 1

In [14]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/PTAR/correction1

Average verified rate: 0.9364051306513091

Average of error rates: 0.06533913043478261

Total token count: 228509

In [15]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[15]:
[('-', 621),
 ("'", 550),
 ('th', 162),
 ('ch', 131),
 ('re-', 124),
 ('be-', 121),
 ('d', 116),
 ('com-', 97),
 ('con-', 97),
 ('ment', 92),
 ('tion', 87),
 ('n', 77),
 ('ver', 74),
 ('ex', 70),
 ('e', 70),
 ('ly', 67),
 ('in-', 65),
 ('t', 64),
 ('x', 60),
 ('un-', 58),
 ('sab-', 56),
 ('de-', 56),
 ('ex-', 55),
 ("'the", 55),
 ('m', 54),
 ('w', 52),
 ('an-', 45),
 ("the'", 44),
 ('pro-', 44),
 ('ments', 43),
 ('ad-', 42),
 ('the-', 39),
 ('r', 35),
 ('mandments', 35),
 ('command-', 35),
 ('dis-', 35),
 ('pre-', 34),
 ('-the', 32),
 ('per-', 31),
 ('atone-', 30),
 ('ry', 29),
 ('--', 27),
 ('f', 27),
 ('tuary', 27),
 ('je-', 26),
 ('ble', 25),
 ('g', 25),
 ('ple', 25),
 ('tions', 24),
 ('mercy-seat', 23)]

Correction 2 -- Correct Line Endings

In [16]:
# %load shared_elements/correct_line_endings.py
prev = cycle
cycle = "correction2"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    content = re.sub(r"(\w+)(\-\s{1,})([a-z]+)", r"\1\3", content)

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()

Check Correction 2

In [17]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/PTAR/correction2

Average verified rate: 0.9506618639049302

Average of error rates: 0.0509304347826087

Total token count: 224593

In [18]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[18]:
[('-', 617),
 ("'", 550),
 ('th', 161),
 ('ch', 131),
 ('d', 116),
 ('n', 76),
 ('ver', 73),
 ('ex', 70),
 ('e', 70),
 ('t', 64),
 ('x', 58),
 ("'the", 55),
 ('m', 54),
 ('w', 52),
 ("the'", 44),
 ('ment', 43),
 ('r', 35),
 ('tion', 34),
 ('ly', 33),
 ('-the', 32),
 ('--', 27),
 ('f', 27),
 ('g', 25),
 ('mercy-seat', 24),
 ("'of", 23),
 ("and'", 20),
 ('ments', 20),
 ('sabbath-day', 19),
 ('scape-goat', 19),
 ("'and", 18),
 ('ry', 17),
 ("to'", 17),
 ('br', 17),
 ('eze', 15),
 ('vt', 15),
 ('-of', 14),
 ("'to", 13),
 ('re-', 13),
 ('nant', 13),
 ('-in', 12),
 ('tuary', 12),
 ('the-', 12),
 ('tions', 12),
 ('ful', 12),
 ('sabbath-days', 11),
 ('com-', 11),
 ("'was", 11),
 ('shut-door', 11),
 ('-and', 11),
 ('con-', 11)]

Correction 3 -- Remove Extra Dashes

In [19]:
# %load shared_elements/remove_extra_dashes.py
prev = cycle
cycle = "correction3"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)
    
    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    
    replacements = []
    for token in tokens:
        if token[0] is "-":
            replacements.append((token, token[1:]))
            
        elif token[-1] is "-":
            replacements.append((token, token[:-1]))
        else:
            pass
        
    if len(replacements) > 0:
        print("{}: {}".format(filename, replacements))
        
        for replacement in replacements:
            content = clean.replace_pair(replacement, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
PTAR184907XX-V01-01-page1.txt: [('-II', 'II'), ('-', '')]
PTAR184907XX-V01-01-page3.txt: [('-', ''), ('COM-', 'COM'), ('PER-', 'PER'), ('-', '')]
PTAR184907XX-V01-01-page4.txt: [('-danger', 'danger'), ('-of', 'of'), ('COVE-', 'COVE'), ('God."-', 'God."'), ('-', ''), ('-', ''), ('-', ''), ('-two', 'two')]
PTAR184907XX-V01-01-page5.txt: [('-', ''), ('COM-', 'COM'), ('COV-', 'COV'), ('COMMAND-', 'COMMAND'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('COVE-', 'COVE'), ('-', '')]
PTAR184907XX-V01-01-page6.txt: [('-', ''), ('-', ''), ('BOND-', 'BOND'), ('-', '')]
PTAR184907XX-V01-01-page8.txt: [('-', ''), ('-', ''), ('no-', 'no')]
PTAR184907XX-V01-01-page9.txt: [('-this', 'this'), ('the-', 'the')]
PTAR184908XX-V01-02-page1.txt: [('-', ''), ('-', ''), ('-', '')]
PTAR184908XX-V01-02-page2.txt: [('-', '')]
PTAR184908XX-V01-02-page3.txt: [('-', ''), ('MIN-', 'MIN')]
PTAR184908XX-V01-02-page4.txt: [('-', '')]
PTAR184908XX-V01-02-page5.txt: [('maen-', 'maen'), ('-', ''), ('-', '')]
PTAR184908XX-V01-02-page6.txt: [('GUILT-', 'GUILT'), ('-', ''), ('RELAX-', 'RELAX'), ('hy-', 'hy')]
PTAR184908XX-V01-02-page7.txt: [('LAW-', 'LAW'), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
PTAR184908XX-V01-02-page8.txt: [('pre-', 'pre'), ('TRANS-', 'TRANS'), ('of-', 'of')]
PTAR184908XX-V01-03-page1.txt: [('no-', 'no'), ('continu--', 'continu-')]
PTAR184908XX-V01-03-page2.txt: [('-', ''), ('-the', 'the'), ('LOV-', 'LOV'), ('-', ''), ('-in', 'in')]
PTAR184908XX-V01-03-page3.txt: [('-', ''), ('-', ''), ('command-', 'command'), ('-whom', 'whom')]
PTAR184908XX-V01-03-page4.txt: [('COM-', 'COM'), ('-', ''), ('-', ''), ('-religion', 'religion')]
PTAR184908XX-V01-03-page6.txt: [('-the', 'the'), ('-art', 'art'), ('-the', 'the'), ('-over', 'over'), ('-', '')]
PTAR184909XX-V01-04-page1.txt: [('OP-', 'OP'), ('Sab-', 'Sab'), ('observ-', 'observ')]
PTAR184909XX-V01-04-page2.txt: [('-in', 'in'), ('-', ''), ('-', '')]
PTAR184909XX-V01-04-page3.txt: [('DISPER-', 'DISPER')]
PTAR184909XX-V01-04-page4.txt: [('-', '')]
PTAR184909XX-V01-04-page5.txt: [('-were', 'were')]
PTAR184909XX-V01-04-page6.txt: [('-', ''), ('commem-', 'commem'), ('requir-', 'requir'), ('peo-', 'peo')]
PTAR184909XX-V01-04-page7.txt: [('-', ''), ('-', ''), ('-if.', 'if.'), ('Sisters--', 'Sisters-')]
PTAR184912XX-V01-05-page1.txt: [('-', ''), ('perform--', 'perform-'), ('-', ''), ('-', ''), ('-written', 'written'), ('gospel."-', 'gospel."'), ('-', ''), ('-of', 'of'), ('IMMOR-', 'IMMOR'), ('-atonement', 'atonement')]
PTAR184912XX-V01-05-page2.txt: [('past-', 'past'), ('-exceedingly', 'exceedingly'), ('-.interesting', '.interesting'), ('-', ''), ('is-', 'is'), ('con-', 'con'), ('JUSTIFI-', 'JUSTIFI'), ('---', '--'), ('--', '-'), ('truth-', 'truth'), ('--the', '-the'), ('-SA', 'SA'), ('of-', 'of'), ('-', ''), ('-felt.', 'felt.')]
PTAR184912XX-V01-05-page3.txt: [('-vision.', 'vision.'), ('-has', 'has'), ('-', ''), ('-and', 'and'), ('the-', 'the'), ('-days', 'days'), ('-', ''), ('-', ''), ('Con-', 'Con'), ('wilder-', 'wilder'), ('settle-', 'settle'), ('-', ''), ('-Sister', 'Sister'), ('-this', 'this'), ('-or', 'or'), ('-Sitter', 'Sitter'), ('..thought-', '..thought'), ('myfeelings-', 'myfeelings'), ('-was', 'was'), ('-', ''), ('-Nom', 'Nom'), ('-ta', 'ta'), ('-to', 'to')]
PTAR184912XX-V01-05-page4.txt: [('-', '')]
PTAR184912XX-V01-05-page5.txt: [('-', ''), ('prepara-', 'prepara'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
PTAR184912XX-V01-05-page6.txt: [('them.-', 'them.'), ('-bowels', 'bowels')]
PTAR184912XX-V01-05-page7.txt: [('globe.--', 'globe.-'), ('--I', '-I'), ('of-', 'of')]
PTAR184912XX-V01-05-page8.txt: [('I.-', 'I.'), ('that-', 'that'), ('-', ''), ('-', ''), ('n-', 'n')]
PTAR184912XX-V01-06-page1.txt: [('-', ''), ('-take', 'take')]
PTAR184912XX-V01-06-page2.txt: [('-Also', 'Also'), ('-', ''), ('TABER-', 'TABER'), ('-', ''), ('-', '')]
PTAR184912XX-V01-06-page3.txt: [('-the', 'the'), ('-in', 'in'), ('-', ''), ('-', ''), ('-', ''), ('the-', 'the'), ('-is', 'is'), ('-', '')]
PTAR184912XX-V01-06-page4.txt: [('-those', 'those'), ('-this', 'this'), ('BE-', 'BE'), ('-procure', 'procure'), ('-If', 'If'), ('-lbregning', 'lbregning')]
PTAR184912XX-V01-06-page5.txt: [('-', ''), ('-', '')]
PTAR184912XX-V01-06-page6.txt: [('-in', 'in'), ('persecu-', 'persecu'), ('-the', 'the'), ('-In', 'In')]
PTAR184912XX-V01-06-page7.txt: [('interest-', 'interest'), ('dollar-', 'dollar'), ('-', '')]
PTAR184912XX-V01-06-page8.txt: [('malice.-', 'malice.'), ('-', ''), ('-why', 'why'), ('-regard', 'regard')]
PTAR185003XX-V01-07-page1.txt: [('-with', 'with'), ('WCT-', 'WCT'), ('REST-', 'REST'), ('REST-', 'REST'), ('-who', 'who'), ('-', '')]
PTAR185003XX-V01-07-page2.txt: [('-', ''), ('Sabbath-', 'Sabbath'), ('-', ''), ('COMMAND-', 'COMMAND'), ('COM-', 'COM'), ('con-', 'con')]
PTAR185003XX-V01-07-page3.txt: [('-', ''), ('-', ''), ('-', ''), ('-even', 'even'), ('-for', 'for'), ('--our', '-our'), ('-people', 'people'), ('-', ''), ('-', '')]
PTAR185003XX-V01-07-page4.txt: [('COMMAND-', 'COMMAND'), ('-', ''), ('-', ''), ('-', ''), ('cove-', 'cove'), ('-', ''), ('EN-', 'EN'), ('-', ''), ('-', ''), ('DE-', 'DE'), ('fall.-', 'fall.'), ('-because', 'because')]
PTAR185003XX-V01-07-page5.txt: [('-with', 'with'), ('--Cor.', '-Cor.'), ('-', '')]
PTAR185003XX-V01-07-page6.txt: [('-The', 'The'), ('-every', 'every'), ('"-', '"'), ('-"law', '"law'), ('of-', 'of'), ("'Gal.-", "'Gal."), ('-', ''), ('-we', 'we'), ('ThelairOf.-', 'ThelairOf.'), ('-', ''), ('-sin', 'sin'), ('-', ''), ('-then.we', 'then.we'), ("-the'", "the'"), ('-warning', 'warning'), ('-', ''), ('-the', 'the'), ('-plain', 'plain'), ('verses-', 'verses'), ('COMMAND-', 'COMMAND'), ('-', '')]
PTAR185003XX-V01-07-page7.txt: [('-earth', 'earth'), ('-', ''), ('--was', '-was'), ('of--', 'of-'), ('Jesusfor--', 'Jesusfor-'), ('-atonement', 'atonement'), ('-can', 'can'), ('-', ''), ('-the', 'the'), ('-Law', 'Law'), ('-"', '"'), ('-Cor.', 'Cor.'), ('-', ''), ('-the', 'the'), ('yptsp-', 'yptsp'), ('-', ''), ('.-', '.'), ('execu-', 'execu'), ('-..co', '..co'), ('-hut', 'hut'), ('-or', 'or'), ('-by', 'by'), ('-the', 'the'), ('-with', 'with'), ('new.-', 'new.'), ('-', ''), ('RIGHTEOUS-', 'RIGHTEOUS'), ('-', ''), ('-exposiqon', 'exposiqon'), ('-', ''), ('-God', 'God'), ('Moses.-', 'Moses.'), ('-', ''), ('condem.-', 'condem.'), ('MINIS-', 'MINIS'), ('--', '-'), ('de-', 'de'), ('the-', 'the'), ('-sin', 'sin'), ('-neither', 'neither'), ('-NOUTALITY.', 'NOUTALITY.'), ('-and', 'and'), ('-the', 'the'), ('"-.-', '"-.'), ('-holy', 'holy'), ('JUSTI-', 'JUSTI'), ('-', ''), ('righteousness"-', 'righteousness"'), ('-MORE.', 'MORE.'), ('which-', 'which'), ('-never.', 'never.'), ('away."-', 'away."')]
PTAR185003XX-V01-07-page8.txt: [('-', ''), ('-', ''), ('-believe', 'believe'), ('-', ''), ('-', ''), ('-will', 'will'), ('explana-', 'explana'), ('-who', 'who')]
PTAR185003XX-V01-08-page1.txt: [('-week', 'week'), ('-', ''), ('-said', 'said')]
PTAR185003XX-V01-08-page2.txt: [('--For', '-For'), ('-expostulates', 'expostulates')]
PTAR185003XX-V01-08-page3.txt: [('-consequenge.ire', 'consequenge.ire'), ('-"There', '"There'), ('re-', 're'), ('passion.L-', 'passion.L'), ('end-', 'end'), ('-In', 'In'), ('iron.-', 'iron.'), ('-', ''), ('Point-', 'Point'), ('govern-', 'govern'), ('father-', 'father'), ('-', ''), ('-', ''), ('-', '')]
PTAR185003XX-V01-08-page4.txt: [('-', ''), ('-marginal', 'marginal'), ('-Here', 'Here'), ('sanctua-', 'sanctua'), ('-', ''), ('-sins', 'sins'), ('else-', 'else'), ('-', ''), ('in-', 'in')]
PTAR185003XX-V01-08-page5.txt: [('HEAV-', 'HEAV')]
PTAR185003XX-V01-08-page6.txt: [('-', ''), ("'A.-", "'A."), ('-commandment', 'commandment'), ('-', ''), ('a-', 'a'), ('-six', 'six'), ('infi-', 'infi'), ('-', ''), ('other-', 'other'), ('"Watchnit-', '"Watchnit'), ('-', ''), ('-connected', 'connected'), ('-sanctuary', 'sanctuary'), ('di-', 'di'), ('-of', 'of')]
PTAR185003XX-V01-08-page7.txt: [('-of', 'of'), ('of-', 'of'), ('watchfulness.-', 'watchfulness.'), ('PEO-', 'PEO'), ('-Gentile', 'Gentile'), ('-be', 'be'), ('bless-', 'bless'), ('con-', 'con'), ('tho-', 'tho'), ('under-', 'under')]
PTAR185003XX-V01-08-page8.txt: [('-immediately', 'immediately')]
PTAR185004XX-V01-09-page1.txt: [('-', ''), ('-still', 'still'), ('-', '')]
PTAR185004XX-V01-09-page2.txt: [('-not', 'not'), ('-', ''), ('-"If', '"If'), ('-', ''), ('-', ''), ('fallen"--', 'fallen"-')]
PTAR185004XX-V01-09-page3.txt: [('KEEP-', 'KEEP'), ('-a', 'a'), ('--', '-'), ('against-', 'against')]
PTAR185004XX-V01-09-page5.txt: [('-which', 'which'), ('-', ''), ('-', '')]
PTAR185004XX-V01-09-page6.txt: [('-', ''), ('forlIttoith-', 'forlIttoith'), ('-that', 'that'), ('bring-', 'bring')]
PTAR185004XX-V01-09-page7.txt: [('-was', 'was'), ('-they', 'they'), ('do-', 'do'), ('-', '')]
PTAR185004XX-V01-09-page8.txt: [('-and', 'and'), ('field."--', 'field."-'), ('-My', 'My'), ('-', ''), ('-therefore', 'therefore')]
PTAR185005XX-V01-10-page1.txt: [('--', '-'), ('--', '-'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('beauti-', 'beauti')]
PTAR185005XX-V01-10-page2.txt: [('-', ''), ('ta-', 'ta'), ('-very', 'very'), ('-', ''), ('-"', '"'), ('New-', 'New'), ('ever-', 'ever'), ('-', ''), ('-', ''), ('reproach-', 'reproach'), ('-but', 'but'), ('-', ''), ('rut-', 'rut')]
PTAR185005XX-V01-10-page3.txt: [('be-', 'be'), ('pre-', 'pre'), ('in.--', 'in.-'), ('-', ''), ('-to', 'to'), ('over-', 'over'), ('-Matt.', 'Matt.'), ('-', ''), ('-', ''), ('--', '-'), ('-', ''), ('-', '')]
PTAR185005XX-V01-10-page4.txt: [('-people', 'people'), ('HEAV-', 'HEAV'), ('IMA-', 'IMA'), ('-', ''), ('-', '')]
PTAR185005XX-V01-10-page5.txt: [('-above', 'above'), ('-power', 'power')]
PTAR185005XX-V01-10-page6.txt: [('-clearly', 'clearly'), ('-', ''), ('-', ''), ('-', ''), ('bride-', 'bride')]
PTAR185005XX-V01-10-page7.txt: [('-dealt', 'dealt'), ('treacher-', 'treacher'), ('-', ''), ('-living', 'living'), ('Lord-', 'Lord'), ('-', ''), ('-', ''), ('Rev.-', 'Rev.'), ('-', ''), ('-', ''), ('-', ''), ('ad-', 'ad')]
PTAR185005XX-V01-10-page8.txt: [('-by', 'by')]
PTAR185008XX-V01-01-page1.txt: [('-', '')]
PTAR185008XX-V01-01-page10.txt: [('wick-', 'wick'), ('-', '')]
PTAR185008XX-V01-01-page11.txt: [('pa-', 'pa'), ('-', ''), ('-and', 'and'), ('an-', 'an'), ('sev-', 'sev'), ('them-', 'them')]
PTAR185008XX-V01-01-page12.txt: [('-reproof.', 'reproof.'), ('Ag-', 'Ag'), ('it-', 'it')]
PTAR185008XX-V01-01-page13.txt: [('busi-', 'busi'), ('-has', 'has'), ('-would', 'would'), ('T-', 'T'), ('effect.-', 'effect.')]
PTAR185008XX-V01-01-page14.txt: [('MES-', 'MES'), ('-', ''), ('-vision', 'vision'), ('-us', 'us'), ('fin-', 'fin'), ('law-', 'law'), ('-it', 'it'), ('in-', 'in'), ('MESSAGE.-', 'MESSAGE.'), ('determin-', 'determin'), ('Sec-', 'Sec')]
PTAR185008XX-V01-01-page15.txt: [('Hast-', 'Hast'), ('-', ''), ('prayer.-', 'prayer.'), ('-', ''), ('judg-', 'judg')]
PTAR185008XX-V01-01-page16.txt: [('-', ''), ('uni-', 'uni'), ('-wicked', 'wicked'), ('-blessedness', 'blessedness'), ('-and', 'and')]
PTAR185008XX-V01-01-page17.txt: [('-us', 'us')]
PTAR185008XX-V01-01-page2.txt: [('large-', 'large'), ('move--', 'move-'), ('oc-', 'oc'), ('inways-', 'inways')]
PTAR185008XX-V01-01-page3.txt: [('ut-', 'ut'), ('commence-', 'commence'), ('do-', 'do'), ('-', ''), ('-', ''), ('how-', 'how')]
PTAR185008XX-V01-01-page4.txt: [('upo-', 'upo'), ('vari-', 'vari'), ('ex-', 'ex')]
PTAR185008XX-V01-01-page5.txt: [('-unaided', 'unaided'), ('-a', 'a'), ('pe-', 'pe'), ('com-', 'com')]
PTAR185008XX-V01-01-page6.txt: [('-evil', 'evil'), ('-', ''), ('re-', 're'), ('-...ndles', '...ndles'), ('pray-', 'pray')]
PTAR185008XX-V01-01-page7.txt: [('pros-', 'pros'), ('-filled', 'filled')]
PTAR185008XX-V01-01-page8.txt: [('vir-', 'vir'), ('EX-', 'EX'), ('-', '')]
PTAR185008XX-V01-01-page9.txt: [('-', ''), ('-', ''), ('pa-', 'pa'), ('-', ''), ('be-', 'be'), ('Sec-', 'Sec'), ('-Iam.', 'Iam.'), ('--Ihave', '-Ihave'), ('saved.-', 'saved.'), ('-', ''), ('fa-', 'fa'), ('and-', 'and'), ('pre-', 'pre'), ('.-', '.'), ('-and', 'and')]
PTAR185008XX-V01-02-page1.txt: [('group-', 'group'), ('-', ''), ('ediica-', 'ediica'), ('grace---', 'grace--'), ('-the', 'the'), ('door-', 'door'), ("-exceed'", "exceed'"), ('-', ''), ('-', '')]
PTAR185008XX-V01-02-page10.txt: [('who-', 'who'), ('-', ''), ('Provi-', 'Provi'), ('hav-', 'hav'), ('-candid"', 'candid"'), ('-', ''), ('-', ''), ('-up', 'up'), ('-partingof', 'partingof'), ('-never', 'never'), ('unim-', 'unim')]
PTAR185008XX-V01-02-page11.txt: [('-s', 's'), ('-', '')]
PTAR185008XX-V01-02-page12.txt: [('pro-', 'pro'), ('-', ''), ('-at', 'at'), ('-as', 'as'), ('Now-', 'Now'), ('pre-', 'pre'), ('provi-', 'provi'), ('--identified', '-identified'), ('--the', '-the'), ('--and', '-and'), ('tar-', 'tar')]
PTAR185008XX-V01-02-page13.txt: [('expound-', 'expound'), ('-', ''), ('-', ''), ('with-', 'with'), ('disap-', 'disap')]
PTAR185008XX-V01-02-page14.txt: [('ADVENT-', 'ADVENT'), ('explana-', 'explana'), ('dis-', 'dis'), ('-at', 'at'), ('-othertiine.', 'othertiine.'), ('illustration.-', 'illustration.'), ('-', ''), ('-in', 'in'), ('-answer', 'answer'), ('Provi-', 'Provi'), ('Matt.-', 'Matt.'), ('fulfill-', 'fulfill'), ('ie-', 'ie'), ('-', ''), ('-', ''), ('-.', '.'), ('-', ''), ('fallbili-', 'fallbili'), ('re-', 're'), ('-after', 'after'), ('-consequent', 'consequent')]
PTAR185008XX-V01-02-page15.txt: [('ar-', 'ar'), ('-', ''), ('-"', '"'), ('--', '-'), ('approv-', 'approv'), ('--', '-'), ('-shall', 'shall'), ('Je-', 'Je'), ('-', ''), ('-', '')]
PTAR185008XX-V01-02-page16.txt: [('the--', 'the-'), ('-"the', '"the'), ('-Advent', 'Advent'), ('God.-', 'God.'), ('-now', 'now'), ('need-', 'need'), ('abol-', 'abol'), ('tes-', 'tes'), ('-it', 'it'), ('"ever-', '"ever'), ('Je-', 'Je'), ('Arch-', 'Arch'), ('-perpetuity', 'perpetuity'), ('as-', 'as'), ('-trifling', 'trifling'), ('assem-', 'assem'), ('DAY."-', 'DAY."'), ('-occasioned', 'occasioned'), ('in-', 'in'), ('-haus.', 'haus.'), ('-', ''), ('Lord---', 'Lord--'), ('-', ''), ('-', ''), ('-words', 'words'), ('-of', 'of'), ('-the', 'the'), ('-', '')]
PTAR185008XX-V01-02-page2.txt: [('-', ''), ('-', ''), ('imme-', 'imme')]
PTAR185008XX-V01-02-page3.txt: [('-Virgins', 'Virgins'), ('specified-', 'specified'), ('-get', 'get'), ('-', ''), ('-that', 'that'), ('-', ''), ('pro-', 'pro'), ('-separate', 'separate'), ('dis-', 'dis'), ('lOtb.-', 'lOtb.'), ('-month', 'month'), ('-midnight', 'midnight'), ('-a', 'a'), ('-finally', 'finally'), ('prOcIa-', 'prOcIa')]
PTAR185008XX-V01-02-page4.txt: [('-', ''), ('-', '')]
PTAR185008XX-V01-02-page5.txt: [('con-', 'con'), ('.-', '.'), ('be-', 'be'), ('burn-', 'burn'), ('ex-', 'ex'), ('him.--', 'him.-'), ('po-', 'po'), ('-', ''), ('-a', 'a'), ('do-', 'do'), ('Vfaxis-', 'Vfaxis'), ('-', '')]
PTAR185008XX-V01-02-page6.txt: [('-and', 'and'), ('-', ''), ('AND-', 'AND'), ('-from', 'from'), ('mes-', 'mes'), ('-after', 'after')]
PTAR185008XX-V01-02-page7.txt: [('-.devising', '.devising'), ('-', ''), ('-plicityof', 'plicityof'), ('The-', 'The'), ('-we', 'we')]
PTAR185008XX-V01-02-page8.txt: [('Crea-', 'Crea'), ('-', ''), ('in-', 'in'), ('there-', 'there'), ('un-', 'un'), ('recorded-', 'recorded'), ('-In', 'In'), ('rerip-', 'rerip'), ('tend-', 'tend'), ('--the', '-the'), ('prophet-', 'prophet'), ('-which', 'which'), ('"-', '"'), ('cor-', 'cor')]
PTAR185008XX-V01-02-page9.txt: [('empires-', 'empires'), ('minu-', 'minu'), ('com-', 'com'), ('ser-', 'ser'), ('num-', 'num'), ('con-', 'con'), ('witness-', 'witness'), ('Je-', 'Je')]
PTAR185009XX-V01-03-page1.txt: [('-', ''), ('-', ''), ('suc-', 'suc'), ('print-', 'print'), ('-', ''), ('CERTAINTY-', 'CERTAINTY'), ('-to', 'to'), ('-', ''), ('success-', 'success')]
PTAR185009XX-V01-03-page10.txt: [('Is-', 'Is'), ('-other', 'other')]
PTAR185009XX-V01-03-page11.txt: [('ADVENT-', 'ADVENT'), ('cove-', 'cove'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('Sanctua-', 'Sanctua'), ('cove-', 'cove'), ('-', '')]
PTAR185009XX-V01-03-page12.txt: [('Sam.-', 'Sam.'), ('-Aholiab', 'Aholiab'), ('-wisdom', 'wisdom'), ('chant-', 'chant'), ('-vessele', 'vessele'), ('-', ''), ('for-', 'for'), ('taberna-', 'taberna'), ('-', ''), ('-', ''), ('-in', 'in'), ('-', ''), ('-thereof', 'thereof'), ('-set', 'set'), ('-Lord', 'Lord'), ('"Purifica-', '"Purifica'), ('desir--', 'desir-'), ('-', ''), ('Sanctuaryl-', 'Sanctuaryl')]
PTAR185009XX-V01-03-page13.txt: [('-', ''), ('-prove', 'prove'), ('-Aaron', 'Aaron'), ('-', ''), ('-', ''), ('-', ''), ('-land', 'land'), ('-', ''), ('bet-', 'bet'), ('con-', 'con'), ('-he', 'he'), ('offollowed-', 'offollowed'), ('-ef', 'ef'), ('.-', '.'), ('flesh-', 'flesh'), ('-', ''), ('-also', 'also')]
PTAR185009XX-V01-03-page14.txt: [('-heaven', 'heaven'), ('of-', 'of'), ('of-', 'of'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('individu-', 'individu'), ('-New', 'New'), ('Le-', 'Le')]
PTAR185009XX-V01-03-page15.txt: [('-', ''), ('enter-', 'enter'), ('-', ''), ('---Last', '--Last')]
PTAR185009XX-V01-03-page16.txt: [('-', ''), ('-', ''), ('-', ''), ('-they', 'they'), ('-', ''), ('wa-', 'wa'), ('-', ''), ('seventy-', 'seventy'), ('meet-', 'meet'), ('re-', 're')]
PTAR185009XX-V01-03-page2.txt: [('-heaven', 'heaven'), ('proph-', 'proph'), ('inter-', 'inter'), ('in-', 'in')]
PTAR185009XX-V01-03-page3.txt: [('char-', 'char'), ('de-', 'de'), ('wil-', 'wil'), ('com-', 'com'), ('-God', 'God'), ('-To', 'To')]
PTAR185009XX-V01-03-page4.txt: [('-"', '"'), ('-', ''), ('-', ''), ('-', ''), ('employ-', 'employ'), ('-is', 'is'), ('-to', 'to'), ('remain-', 'remain')]
PTAR185009XX-V01-03-page5.txt: [('-as', 'as'), ('the-', 'the'), ('IVIan-', 'IVIan'), ('Bride-', 'Bride'), ('corn-', 'corn'), ('an-', 'an')]
PTAR185009XX-V01-03-page6.txt: [('Ad-', 'Ad'), ('de-', 'de'), ('un-', 'un'), ('Command-', 'Command'), ('com-', 'com'), ('-', ''), ('"-', '"'), ('the-', 'the'), ('fool-', 'fool'), ('NOT-', 'NOT'), ('ser-', 'ser'), ('-', '')]
PTAR185009XX-V01-03-page7.txt: [('"In-', '"In'), ('-God', 'God'), ('-away', 'away'), ('-', ''), ('pave-', 'pave'), ('-as', 'as'), ('-a', 'a'), ('Ad-', 'Ad'), ('-', '')]
PTAR185009XX-V01-03-page8.txt: [('Jeho-', 'Jeho'), ('first.-', 'first.'), ('-', ''), ('-', ''), ('coun-', 'coun'), ('-', ''), ('since.-', 'since.'), ('-not', 'not'), ('-', ''), ('great-', 'great'), ('-', ''), ('pub-', 'pub'), ('-', ''), ('-all', 'all'), ('-had', 'had'), ('-Those', 'Those'), ('-', '')]
PTAR185009XX-V01-03-page9.txt: [('-', ''), ('than-', 'than'), ('-', ''), ('-observes', 'observes'), ('-maintain', 'maintain'), ('-compel', 'compel'), ('----', '---'), ('-', ''), ('-', ''), ('-It', 'It'), ('clannessed-', 'clannessed'), ('-the', 'the'), ('us-', 'us'), ('--still', '-still'), ('treatit.-', 'treatit.'), ("hereafter.'-", "hereafter.'"), ('-known', 'known'), ('-', ''), ('-according', 'according'), ('unwil-', 'unwil'), ('-', ''), ('-your', 'your'), ('-', ''), ('Veri-', 'Veri'), ('-verily', 'verily'), ('-what', 'what'), ('to-', 'to'), ('now-', 'now'), ('-been', 'been'), ('-belong', 'belong'), ('-ordinances', 'ordinances'), ('-does', 'does'), ('-his', 'his'), ('--on', '-on'), ('imervation.--L-', 'imervation.--L'), ('-who', 'who')]
PTAR185009XX-V01-04-page1.txt: [('DAY-', 'DAY'), ('-', ''), ('-But', 'But'), ('THOU-', 'THOU'), ('world-', 'world'), ('-', ''), ('-', ''), ('JUDG-', 'JUDG')]
PTAR185009XX-V01-04-page10.txt: [('-', ''), ('-', ''), ('atone-', 'atone'), ('-', ''), ('-', '')]
PTAR185009XX-V01-04-page11.txt: [('-', ''), ('-', ''), ('reali-', 'reali'), ('ini--', 'ini-'), ('-', ''), ('-definite', 'definite'), ('atone-', 'atone'), ('taber-', 'taber'), ('-', ''), ('-', '')]
PTAR185009XX-V01-04-page12.txt: [('-', ''), ('trans-', 'trans'), ('for-', 'for'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-"baptized', '"baptized')]
PTAR185009XX-V01-04-page13.txt: [('do-', 'do'), ('objec-', 'objec'), ('in-', 'in'), ('hea-', 'hea'), ('abomi-', 'abomi'), ('-', '')]
PTAR185009XX-V01-04-page14.txt: [('Je-', 'Je'), ('un-', 'un'), ('wasful-', 'wasful'), ('-examined.', 'examined.'), ('sa-', 'sa'), ('-', ''), ('-', ''), ('He-', 'He')]
PTAR185009XX-V01-04-page15.txt: [('-without', 'without'), ('un-', 'un'), ('Won-', 'Won'), ('cleans-', 'cleans')]
PTAR185009XX-V01-04-page16.txt: [('-truth.', 'truth.'), ('Eno-', 'Eno'), ('-', ''), ('-', ''), ('-', ''), ('--', '-'), ('-', '')]
PTAR185009XX-V01-04-page2.txt: [('-', ''), ('na-', 'na'), ('-from', 'from')]
PTAR185009XX-V01-04-page3.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-but', 'but'), ('--He', '-He'), ('A-', 'A'), ('-Bro.', 'Bro.'), ("---'", "--'")]
PTAR185009XX-V01-04-page4.txt: [('--', '-'), ('before.--', 'before.-'), ('vol-', 'vol')]
PTAR185009XX-V01-04-page5.txt: [('ac-', 'ac'), ('-', ''), ('as-', 'as'), ('fulfilled.--', 'fulfilled.-'), ('under-', 'under'), ('faith-', 'faith'), ('Universal-', 'Universal')]
PTAR185009XX-V01-04-page6.txt: [('infor-', 'infor'), ('di-', 'di'), ('-', ''), ('yes.---', 'yes.--'), ('-ahead.', 'ahead.'), ('-the', 'the')]
PTAR185009XX-V01-04-page7.txt: [('-', ''), ('satisfac-', 'satisfac'), ('scarce-', 'scarce'), ('-unbound', 'unbound'), ('faith--', 'faith-'), ('-', ''), ('-.', '.'), ('-', '')]
PTAR185009XX-V01-04-page8.txt: [('-', ''), ('-For', 'For'), ('THOU-', 'THOU'), ('-', ''), ('-apply', 'apply'), ('-was', 'was'), ('-', ''), ('ser-', 'ser'), ('Rev.-', 'Rev.'), ('-', ''), ('-', ''), ('sup-', 'sup')]
PTAR185009XX-V01-04-page9.txt: [('-', ''), ('saying-', 'saying'), ('-', ''), ('up-', 'up'), ('-', ''), ('-', ''), ('-', ''), ('be-', 'be'), ('-at', 'at'), ('-', ''), ('P-', 'P'), ('-', ''), ('forgiVe-', 'forgiVe'), ('-', ''), ('-Lord', 'Lord'), ('-court', 'court'), ('I-', 'I')]
PTAR185009XX-V01-EX-page10.txt: [('the-', 'the'), ('moun-', 'moun'), ('super-', 'super'), ('two-horn-', 'two-horn'), ('-is', 'is')]
PTAR185009XX-V01-EX-page11.txt: [('-', ''), ('Mon-', 'Mon'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('promi-', 'promi'), ('-', ''), ('-Itt', 'Itt'), ('-a', 'a'), ('-', ''), ('COMMAND-', 'COMMAND'), ('-', ''), ('con-', 'con')]
PTAR185009XX-V01-EX-page12.txt: [('of-', 'of'), ('-day', 'day'), ('-', ''), ('-', ''), ('-', ''), ('enjoin-', 'enjoin'), ('-', '')]
PTAR185009XX-V01-EX-page13.txt: [('-', ''), ('-', ''), ('-', ''), ('WON-', 'WON'), ('-', ''), ('re-', 're'), ('--the', '-the'), ('with-', 'with'), ('salva-', 'salva')]
PTAR185009XX-V01-EX-page14.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('ta-', 'ta'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('him-', 'him'), ('corn-', 'corn')]
PTAR185009XX-V01-EX-page15.txt: [('Sanc-', 'Sanc'), ('-borne', 'borne'), ('FIL-', 'FIL'), ('per-', 'per'), ('giV-', 'giV'), ('-and', 'and'), ('ene-', 'ene'), ('-', '')]
PTAR185009XX-V01-EX-page16.txt: [('-', ''), ('-but', 'but'), ('heav-', 'heav'), ('-', ''), ('pre-', 'pre'), ('-', ''), ('-', ''), ('de-', 'de'), ('na-', 'na'), ('trans-', 'trans')]
PTAR185009XX-V01-EX-page17.txt: [('-', ''), ('af-', 'af'), ('un-', 'un'), ('desti-', 'desti'), ("-ever.'", "ever.'"), ('FIL-', 'FIL'), ('--', '-'), ('-', ''), ('--', '-'), ('--', '-')]
PTAR185009XX-V01-EX-page2.txt: [('-', ''), ('stood.-', 'stood.'), ('-.As', '.As'), ('unnecessa-', 'unnecessa'), ('RE-', 'RE'), ('truth.--', 'truth.-'), ('-art', 'art'), ('hot."-', 'hot."'), ('-Like', 'Like'), ('-day', 'day'), ('-', ''), ('an-', 'an'), ('-', ''), ('door-', 'door'), ('-they', 'they')]
PTAR185009XX-V01-EX-page3.txt: [('-', ''), ('af-', 'af'), ('com-', 'com'), ('-mandments', 'mandments'), ('-', '')]
PTAR185009XX-V01-EX-page4.txt: [('-', ''), ('the-cengre-', 'the-cengre'), ('"Be-', '"Be'), ('-', ''), ('-', ''), ('-shut', 'shut'), ('-the', 'the'), ('-white', 'white'), ('ex-', 'ex'), ('-', ''), ('death.-', 'death.'), ('-there', 'there'), ('-', ''), ('-to', 'to'), ('-Eze.', 'Eze.'), ('-', ''), ('-', ''), ('-life', 'life'), ('hea-', 'hea'), ('-rebuke', 'rebuke'), ('-', ''), ('.--', '.-'), ('-', ''), ('al-', 'al'), ('-who', 'who'), ('-yet', 'yet'), ('-nearer-a', 'nearer-a'), ('class-', 'class')]
PTAR185009XX-V01-EX-page5.txt: [('Jeru-', 'Jeru'), ('in-', 'in'), ('es-', 'es'), ('-', ''), ('after-', 'after'), ('-not', 'not'), ('-', ''), ('anti-', 'anti'), ("-Ione.'", "Ione.'")]
PTAR185009XX-V01-EX-page6.txt: [('-in', 'in'), ('-we', 'we'), ('Jo-', 'Jo'), ('-', ''), ('away-', 'away'), ('ad-', 'ad'), ('-', '')]
PTAR185009XX-V01-EX-page7.txt: [('Corn-', 'Corn'), ('-', ''), ('ser-', 'ser'), ('scat-', 'scat'), ('-', ''), ('pow-', 'pow'), ('pro-', 'pro'), ('-Herald', 'Herald'), ('-', ''), ('-', ''), ('-of', 'of'), ('-do', 'do'), ('-be', 'be'), ('ii-', 'ii'), ('-', ''), ('-', ''), ('-', ''), ('field.-', 'field.')]
PTAR185009XX-V01-EX-page8.txt: [('-steppinginto', 'steppinginto'), ('-', ''), ('ad-', 'ad'), ('the-', 'the')]
PTAR185009XX-V01-EX-page9.txt: [('-says', 'says'), ('-See', 'See'), ('-', ''), ('-', ''), ('-', '')]
PTAR185011XX-V01-05-page1.txt: [('-.-', '.-'), ('-Publishing', 'Publishing'), ('\'"ittee-', '\'"ittee'), ('-', ''), ('-wellthatthey', 'wellthatthey'), ('BABY-', 'BABY'), ('-', ''), ('founda-', 'founda'), ('-', ''), ('--Hence', '-Hence'), ('-out', 'out'), ('-the', 'the'), ('-to', 'to'), ("'try-", "'try"), ('organized-', 'organized'), ('-declaring', 'declaring'), ('-', ''), ('and-', 'and'), ('-', ''), ('intro-', 'intro'), ('-ferithe', 'ferithe'), ('-', ''), ('-', ''), ('in-', 'in'), ('-', ''), ('-arose', 'arose'), ('-had', 'had'), ('-What', 'What'), ('-hat', 'hat'), ('-in', 'in'), ('-', ''), ('-lac-', 'lac-'), ('-the', 'the'), ('-', '')]
PTAR185011XX-V01-05-page2.txt: [('proces-', 'proces'), ('-of', 'of'), ('proclai-', 'proclai'), ('-my', 'my'), ('-Now', 'Now'), ('per-', 'per')]
PTAR185011XX-V01-05-page3.txt: [('-with', 'with'), ('-', ''), ('quoting-', 'quoting'), ('-represents', 'represents'), ('be-', 'be'), ('-agreeing.', 'agreeing.'), ('harmo-', 'harmo'), ('there--', 'there-'), ('-of', 'of'), ('ceas-', 'ceas'), ('-', '')]
PTAR185011XX-V01-05-page4.txt: [('-them', 'them'), ('-', ''), ('mes-', 'mes'), ('--just', '-just'), ('-seventy-five', 'seventy-five'), ('-God', 'God'), ('-unbelievers.', 'unbelievers.'), ('-', ''), ('-of', 'of'), ('clo-', 'clo'), ('-July', 'July'), ('-', ''), ('Advent-Li-', 'Advent-Li'), ('No..-', 'No..'), ('mes--', 'mes-'), ('man-', 'man'), ('of-', 'of'), ("-mother's", "mother's"), ('-great', 'great'), ('unparal-', 'unparal'), ('-seventlimonth.', 'seventlimonth.'), ('some-', 'some'), ('-something', 'something')]
PTAR185011XX-V01-05-page5.txt: [('-are', 'are'), ('-experience', 'experience'), ('fin-', 'fin'), ('-Gone', 'Gone'), ('-', ''), ('-', ''), ('-to', 'to'), ('be-', 'be'), ('emir-', 'emir'), ('-we', 'we'), ('-is', 'is')]
PTAR185011XX-V01-05-page6.txt: [('-Upon', 'Upon'), ('-with', 'with'), ('-the', 'the'), ('how-', 'how'), ('-she', 'she'), ('tall-', 'tall'), ('-the', 'the'), ('-dren', 'dren'), ('res-', 'res'), ('-', ''), ('Jeru-', 'Jeru'), ('carbun-', 'carbun'), ('Jeru-', 'Jeru'), ('-her', 'her'), ('Je-', 'Je'), ('ino-', 'ino')]
PTAR185011XX-V01-05-page7.txt: [('-Does', 'Does'), ('Je-', 'Je'), ('--Hold', '-Hold'), ('Lord.God-', 'Lord.God'), ('-for', 'for'), ('-', ''), ('-', ''), ('-', ''), ('--"It', '-"It'), ('-not', 'not'), ('idea.-', 'idea.'), ('-', ''), ('kiah.-', 'kiah.'), ('-', ''), ('-', ''), ('-"Know', '"Know'), ('-', '')]
PTAR185011XX-V01-05-page8.txt: [('solo-', 'solo'), ('command--', 'command-'), ('-beloVed', 'beloVed'), ('rims-', 'rims'), ('-Share', 'Share'), ('-the', 'the'), ('-answer', 'answer'), ('-church', 'church'), ('done.-', 'done.'), ('-by', 'by'), ('faith-', 'faith'), ('prejudi-', 'prejudi'), ('life-', 'life'), ('-tat', 'tat'), ('walk-', 'walk'), ('ordinances-', 'ordinances'), ('-which', 'which'), ('par-', 'par'), ('-The', 'The'), ('-', ''), ('-prospect', 'prospect'), ('-work', 'work'), ('The.-', 'The.'), ('-partially', 'partially'), ('-', ''), ('-', ''), ('im-', 'im'), ('-arnongthe', 'arnongthe'), ('to-', 'to'), ('-', ''), ('Louata-', 'Louata'), ('-', ''), ('-', ''), ('-The', 'The'), ('-', ''), ('-be', 'be')]
PTAR185011XX-V01-11-page1.txt: [('-TRUTH', 'TRUTH'), ('-', ''), ("Thro'-", "Thro'"), ('-..abbotti', '..abbotti'), ('-', '')]
PTAR185011XX-V01-11-page2.txt: [('dan-', 'dan'), ('called-', 'called'), ('-', ''), ('in-', 'in'), ('-the', 'the'), ('coetro-', 'coetro'), ('Pa--', 'Pa-'), ('commandwe-', 'commandwe'), ('harm-', 'harm'), ('observe-', 'observe')]
PTAR185011XX-V01-11-page3.txt: [('OB-', 'OB'), ('-No', 'No'), ('commandment.-', 'commandment.'), ('spe-', 'spe'), ('-such', 'such'), ('exposi-', 'exposi'), ('-seventh.', 'seventh.'), ('-', ''), ('-indefinitely."', 'indefinitely."'), ('fol-', 'fol'), ('-proves.', 'proves.'), ('guilt-', 'guilt'), ("'how-", "'how")]
PTAR185011XX-V01-11-page4.txt: [('-day', 'day'), ('Scrip-', 'Scrip'), ('restora-', 'restora'), ('there-', 'there'), ('an-', 'an'), ('sus-', 'sus'), ('-', ''), ('ex-', 'ex'), ('re-', 're')]
PTAR185011XX-V01-11-page5.txt: [('-T.', 'T.'), ('-started', 'started'), ('-unto', 'unto'), ('-all', 'all'), ('York-', 'York'), ('HOLT-', 'HOLT')]
PTAR185011XX-V01-11-page6.txt: [('-of', 'of'), ('-of', 'of'), ('-he', 'he'), ('-to', 'to'), ('separa-', 'separa'), ('ar-', 'ar'), ('-saints', 'saints'), ('-', '')]
PTAR185011XX-V01-11-page7.txt: [('-that', 'that'), ('c.-', 'c.'), ('the-', 'the'), ('-and', 'and'), ('-', ''), ('Testa-', 'Testa'), ('-', ''), ('-', ''), ('Gentiles.-', 'Gentiles.'), ('chap--', 'chap-'), ('-See', 'See'), ('-', ''), ('-', ''), ('-', ''), ('direct-', 'direct'), ('-of', 'of'), ('blind-', 'blind')]
PTAR185011XX-V01-11-page8.txt: [('em-', 'em'), ('-', ''), ('an-', 'an'), ("'-", "'"), ('-', ''), ('-.', '.'), ('rug-', 'rug'), ('Sab-', 'Sab'), ('-and', 'and')]
PTAR1850XXXX-VXX-XX-page1.txt: [('-page', 'page')]
PTAR1850XXXX-VXX-XX-page10.txt: [('-', ''), ('-', ''), ('-', ''), ('Sec-', 'Sec'), ('saved.-', 'saved.'), ('-', ''), ('pre-', 'pre'), ('.-', '.')]
PTAR1850XXXX-VXX-XX-page11.txt: [('wick-', 'wick'), ("--'", "-'"), ('-understand', 'understand'), ("--'", "-'")]
PTAR1850XXXX-VXX-XX-page12.txt: [('pa-', 'pa'), ('-', '')]
PTAR1850XXXX-VXX-XX-page13.txt: [('Millerism.--', 'Millerism.-'), ('-will', 'will'), ('imperiously-', 'imperiously'), ('-', ''), ('long-', 'long'), ('AP-', 'AP'), ('it-', 'it')]
PTAR1850XXXX-VXX-XX-page14.txt: [('busi-', 'busi'), ('recent-', 'recent'), ('world-', 'world')]
PTAR1850XXXX-VXX-XX-page15.txt: [('-what', 'what'), ('sal-', 'sal'), ('-know', 'know'), ('-', ''), ('-shall', 'shall'), ('admon.-', 'admon.')]
PTAR1850XXXX-VXX-XX-page16.txt: [('-', ''), ('territo-', 'territo')]
PTAR1850XXXX-VXX-XX-page17.txt: [('-days.', 'days.'), ('-', ''), ('-', '')]
PTAR1850XXXX-VXX-XX-page18.txt: [('ever-', 'ever'), ('Bride-', 'Bride'), ('acknowledge-', 'acknowledge'), ('-Babylon', 'Babylon'), ('-', ''), ('po-', 'po'), ('torch-', 'torch'), ('-', '')]
PTAR1850XXXX-VXX-XX-page19.txt: [('do-', 'do'), ('Mans-', 'Mans'), ('-', ''), ('-', ''), ('glo-', 'glo')]
PTAR1850XXXX-VXX-XX-page20.txt: [('fa-', 'fa'), ('lay-', 'lay'), ('-', ''), ('eannotr--', 'eannotr-'), ('-the', 'the')]
PTAR1850XXXX-VXX-XX-page21.txt: [('Burn-', 'Burn'), ('-present', 'present'), ('-truth', 'truth'), ('proclama-', 'proclama'), ('can-', 'can'), ('-', ''), ('oversight--', 'oversight-'), ('to-', 'to'), ('dread-', 'dread'), ('-', '')]
PTAR1850XXXX-VXX-XX-page22.txt: [('-preservation', 'preservation'), ('-to', 'to'), ('spe-', 'spe'), ('Del-', 'Del')]
PTAR1850XXXX-VXX-XX-page23.txt: [('Bonn-', 'Bonn'), ('not."--', 'not."-'), ('witness-', 'witness'), ('Je-', 'Je'), ('counter-', 'counter'), ('conflagra-', 'conflagra'), ('be-', 'be'), ('-', ''), ('-in', 'in'), ('occur-', 'occur'), ('con-', 'con')]
PTAR1850XXXX-VXX-XX-page24.txt: [('wit-', 'wit'), ('wit-', 'wit'), ('-it.', 'it.'), ('-', ''), ('-t', 't')]
PTAR1850XXXX-VXX-XX-page25.txt: [('-', ''), ('--fill', '-fill'), ('christen-', 'christen'), ('predict-', 'predict')]
PTAR1850XXXX-VXX-XX-page26.txt: [('Ad-', 'Ad'), ('pre-', 'pre'), ('provi-', 'provi'), ('tar-', 'tar'), ('-', ''), ('-the', 'the'), ('trans-', 'trans')]
PTAR1850XXXX-VXX-XX-page27.txt: [('natural-', 'natural'), ('-ness', 'ness'), ('-him', 'him'), ('--of', '-of'), ('-', ''), ('believe.--', 'believe.-'), ('peo-', 'peo')]
PTAR1850XXXX-VXX-XX-page28.txt: [('-through', 'through'), ('tes-', 'tes'), ('-', ''), ('-', '')]
PTAR1850XXXX-VXX-XX-page29.txt: [('com-', 'com'), ('unwor-', 'unwor'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
PTAR1850XXXX-VXX-XX-page3.txt: [('giv-', 'giv'), ('ad-', 'ad'), ('move-', 'move'), ('al-', 'al')]
PTAR1850XXXX-VXX-XX-page30.txt: [('-', ''), ('-', ''), ('-', ''), ('Sab-', 'Sab'), ('-all', 'all'), ('Je-', 'Je')]
PTAR1850XXXX-VXX-XX-page31.txt: [('-', ''), ('the-', 'the'), ('-or', 'or'), ('ener-', 'ener'), ('-', ''), ('-on', 'on'), ('-Saviour', 'Saviour')]
PTAR1850XXXX-VXX-XX-page32.txt: [('-', ''), ('de-', 'de'), ('wil-', 'wil')]
PTAR1850XXXX-VXX-XX-page33.txt: [('-', ''), ('-world', 'world')]
PTAR1850XXXX-VXX-XX-page34.txt: [('histo-', 'histo'), ('-', ''), ('-a', 'a'), ('-attached', 'attached'), ('re-', 're'), ('ex-', 'ex'), ('-', '')]
PTAR1850XXXX-VXX-XX-page35.txt: [('anthe-', 'anthe'), ('representa-', 'representa'), ('ac-', 'ac'), ('C.-', 'C.'), ('af-', 'af'), ('-', ''), ('-failed.', 'failed.'), ('Ad-', 'Ad'), ('de-', 'de'), ('-vice', 'vice'), ('-it', 'it'), ('un-', 'un'), ('Command-', 'Command')]
PTAR1850XXXX-VXX-XX-page36.txt: [('experience-', 'experience'), ('--better', '-better'), ('-', ''), ('-of', 'of')]
PTAR1850XXXX-VXX-XX-page37.txt: [('-', ''), ('an-', 'an'), ('-lie', 'lie'), ('re-', 're'), ('can-', 'can'), ('-', ''), ('an-', 'an'), ('-', ''), ('-', ''), ('Isa.-', 'Isa.'), ('iii.-', 'iii.'), ('-', ''), ('-', ''), ('-', ''), ('coun-', 'coun'), ('repu-', 'repu'), ('re-', 're')]
PTAR1850XXXX-VXX-XX-page38.txt: [('-', ''), ('-', ''), ('wit-', 'wit'), ('-', '')]
PTAR1850XXXX-VXX-XX-page39.txt: [('-his', 'his'), ('unwil-', 'unwil'), ('-', ''), ('tradi-', 'tradi'), ('-', ''), ('-', ''), ('-was', 'was'), ('-to', 'to'), ('-', ''), ("-MILNER'S", "MILNER'S"), ('-', '')]
PTAR1850XXXX-VXX-XX-page4.txt: [('-', ''), ('--in', '-in'), ('how-', 'how')]
PTAR1850XXXX-VXX-XX-page40.txt: [('world-', 'world'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('cleans-', 'cleans'), ('-', ''), ('-', ''), ('Solo-', 'Solo')]
PTAR1850XXXX-VXX-XX-page41.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('.-', '.'), ('.-', '.'), ('-', ''), ('-', ''), ('-', ''), ('"-', '"'), ('-', '')]
PTAR1850XXXX-VXX-XX-page42.txt: [('-', ''), ('-', ''), ('.-', '.'), ('bet-', 'bet'), ('con-', 'con'), ('-', ''), ('.-', '.')]
PTAR1850XXXX-VXX-XX-page43.txt: [('of--', 'of-'), ('sanctified-', 'sanctified'), ('-I', 'I'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('individu-', 'individu'), ('Le-', 'Le'), ('atone-', 'atone'), ('-xiv', 'xiv')]
PTAR1850XXXX-VXX-XX-page44.txt: [('-', ''), ('accord.-', 'accord.'), ('-shall', 'shall'), ('hal-', 'hal'), ('-', ''), ('-', ''), ('-', ''), ('tres-', 'tres'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
PTAR1850XXXX-VXX-XX-page45.txt: [('-', ''), ('-', ''), ('-"', '"'), ('-the', 'the'), ('un-', 'un'), ('atone-', 'atone'), ('-', ''), ('Thurnmimlight-', 'Thurnmimlight'), ('atone-', 'atone'), ('-', ''), ('per-', 'per'), ('-the.', 'the.')]
PTAR1850XXXX-VXX-XX-page46.txt: [('trans-', 'trans'), ('-', ''), ('rebel-', 'rebel'), ('-', ''), ('-', ''), ('-S', 'S'), ('atone-', 'atone'), ('iniqui-', 'iniqui'), ('-', ''), ('atone-', 'atone'), ('"-', '"'), ('taber-', 'taber'), ('per-', 'per')]
PTAR1850XXXX-VXX-XX-page47.txt: [('-', ''), ('trans-', 'trans'), ('-Priest', 'Priest'), ('for-', 'for'), ('IIo-', 'IIo'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('cleans-', 'cleans'), ('Dispen-', 'Dispen')]
PTAR1850XXXX-VXX-XX-page48.txt: [('-', ''), ('-and', 'and'), ('the-', 'the'), ('-Just', 'Just'), ('-it', 'it'), ('-is', 'is'), ('-the', 'the')]
PTAR1850XXXX-VXX-XX-page49.txt: [('-', ''), ('Je-', 'Je'), ('un-', 'un'), ('-', ''), ('ini-', 'ini'), ('ful-', 'ful'), ('-', ''), ('unre-', 'unre'), ('sa-', 'sa'), ('-as', 'as'), ('himself.L---', 'himself.L--'), ('-', ''), ('He-', 'He')]
PTAR1850XXXX-VXX-XX-page5.txt: [('vari-', 'vari'), ('-', ''), ('oppo-', 'oppo'), ('-millennium', 'millennium')]
PTAR1850XXXX-VXX-XX-page50.txt: [('un-', 'un'), ('Won-', 'Won'), ('Sanc-', 'Sanc'), ('up-', 'up'), ('habited."-', 'habited."'), ('"Be-', '"Be'), ('-mesmer', 'mesmer'), ('-the', 'the'), ('-for', 'for'), ('scape-', 'scape'), ('-from', 'from')]
PTAR1850XXXX-VXX-XX-page51.txt: [('-page', 'page'), ('RE-', 'RE')]
PTAR1850XXXX-VXX-XX-page6.txt: [('pe-', 'pe'), ('com-', 'com')]
PTAR1850XXXX-VXX-XX-page7.txt: [('cer-', 'cer'), ('-', ''), ('-do', 'do'), ('Gam-', 'Gam'), ('re-', 're'), ("'Kock-", "'Kock"), ('pray-', 'pray'), ('-of', 'of'), ('-fallen.', 'fallen.'), ('thy-', 'thy'), ('-', '')]
PTAR1850XXXX-VXX-XX-page8.txt: [('pros--', 'pros-'), ('reach-', 'reach'), ('-', ''), ('-continual', 'continual'), ('Bal-', 'Bal')]
PTAR1850XXXX-VXX-XX-page9.txt: [('vir-', 'vir'), ('-very', 'very'), ('-', ''), ('-JUSTIFY', 'JUSTIFY'), ('con-', 'con'), ('jus-', 'jus'), ('-from', 'from'), ("-Lot's", "Lot's")]

Check Correction 3

In [20]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/PTAR/correction3

Average verified rate: 0.9599526551389643

Average of error rates: 0.039791304347826094

Total token count: 224734

In [21]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[21]:
[("'", 565),
 ('th', 161),
 ('ch', 132),
 ('d', 117),
 ('n', 77),
 ('ex', 75),
 ('ver', 74),
 ('e', 71),
 ('t', 68),
 ('x', 58),
 ("'the", 56),
 ('m', 54),
 ('w', 53),
 ('ment', 46),
 ("the'", 45),
 ('r', 37),
 ('tion', 34),
 ('ly', 33),
 ('f', 27),
 ('g', 27),
 ('re', 25),
 ("'of", 23),
 ("and'", 20),
 ('ments', 20),
 ("'and", 18),
 ("to'", 18),
 ('eze', 17),
 ('ry', 17),
 ('br', 17),
 ('vt', 15),
 ('nant', 13),
 ("'to", 13),
 ('ful', 13),
 ('tuary', 12),
 ('tions', 12),
 ('es', 12),
 ('cy', 11),
 ('un', 11),
 ("'was", 11)]

Correction 4 -- Remove Extra Quotation Marks

In [22]:
# %load shared_elements/replace_extra_quotation_marks.py
prev = cycle
cycle = "correction4"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)
    
    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    
    corrections = []
    for token in tokens:
        token_list = list(token)
        last_char = token_list[-1]

        if last_char is "'":
            if len(token) > 1:
                if token_list[-2] is 's' or 'S':
                    pass
                else:
                    corrections.append((token, re.sub(r"'", r"", token)))
            else:
                pass
        elif token[0] is "'":
            corrections.append((token, re.sub(r"'", r"", token)))   
        else:
            pass
    
    if len(corrections) > 0:
        print('{}: {}'.format(filename, corrections))

        for correction in corrections:
            content = clean.replace_pair(correction, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
PTAR184907XX-V01-01-page3.txt: [("'these", 'these')]
PTAR184907XX-V01-01-page5.txt: [("'from", 'from')]
PTAR184907XX-V01-01-page7.txt: [("'violate", 'violate')]
PTAR184907XX-V01-01-page8.txt: [("'by", 'by')]
PTAR184907XX-V01-01-page9.txt: [("'of", 'of'), ("'keep", 'keep')]
PTAR184908XX-V01-02-page1.txt: [("'the", 'the')]
PTAR184908XX-V01-02-page8.txt: [("'act", 'act')]
PTAR184908XX-V01-03-page3.txt: [("'TILL", 'TILL')]
PTAR184908XX-V01-03-page7.txt: [("'gave", 'gave')]
PTAR184909XX-V01-04-page1.txt: [("'away", 'away')]
PTAR184909XX-V01-04-page4.txt: [("'that", 'that'), ("'present", 'present')]
PTAR184909XX-V01-04-page5.txt: [("'on", 'on')]
PTAR184912XX-V01-05-page2.txt: [("'and", 'and'), ("'recently", 'recently'), ("'Nov.", 'Nov.'), ("'impressed", 'impressed'), ("'Moser", 'Moser'), ("'of", 'of'), ("'the", 'the'), ("'into", 'into'), ("'him", 'him'), ("'the", 'the')]
PTAR184912XX-V01-05-page3.txt: [("'had", 'had'), ("'Me", 'Me'), ("'Rhodes", 'Rhodes'), ("'Itelpli", 'Itelpli'), ("'all.", 'all.'), ("'among", 'among'), ("'I", 'I'), ("'the", 'the'), ("'feared", 'feared'), ("'Bro.", 'Bro.'), ("'be.sPre", 'be.sPre'), ("'hy", 'hy')]
PTAR184912XX-V01-05-page4.txt: [("'should", 'should')]
PTAR184912XX-V01-05-page6.txt: [("'of", 'of')]
PTAR184912XX-V01-05-page8.txt: [("'WHITE", 'WHITE'), ("'my", 'my'), ("'You", 'You'), ("'closes", 'closes'), ("'on", 'on'), ("'isle", 'isle'), ("'Spirit", 'Spirit')]
PTAR184912XX-V01-06-page1.txt: [("'St.", 'St.')]
PTAR184912XX-V01-06-page2.txt: [("'perfectly", 'perfectly')]
PTAR184912XX-V01-06-page4.txt: [("'It", 'It'), ("'some", 'some'), ("'seeing", 'seeing'), ("'the", 'the')]
PTAR184912XX-V01-06-page5.txt: [("'of", 'of'), ("'to", 'to')]
PTAR184912XX-V01-06-page6.txt: [("'we", 'we')]
PTAR185003XX-V01-07-page2.txt: [("'vas", 'vas')]
PTAR185003XX-V01-07-page4.txt: [("'to", 'to')]
PTAR185003XX-V01-07-page5.txt: [("'the", 'the')]
PTAR185003XX-V01-07-page6.txt: [("'THE", 'THE'), ("'Gal.", 'Gal.'), ("'chapter", 'chapter'), ("'Aid", 'Aid'), ("'retired", 'retired'), ("'God", 'God'), ("'was", 'was'), ("'whe", 'whe'), ("'if", 'if'), ("'nine", 'nine'), ("'COMMAND", 'COMMAND'), ("'is", 'is'), ("'keeping", 'keeping'), ("'Wise", 'Wise')]
PTAR185003XX-V01-07-page7.txt: [("'.", '.'), ("'verse", 'verse'), ("'righteous", 'righteous'), ("'of", 'of'), ("'whO", 'whO'), ("'inder", 'inder'), ("'Services", 'Services'), ("'ofthe", 'ofthe'), ("'The", 'The'), ("'IjEATH", 'IjEATH'), ("'The", 'The'), ("'is", 'is'), ("'cern", 'cern'), ("'the", 'the')]
PTAR185003XX-V01-07-page8.txt: [("'covenant", 'covenant'), ("'the", 'the'), ("'.", '.')]
PTAR185003XX-V01-08-page1.txt: [("'one", 'one')]
PTAR185003XX-V01-08-page2.txt: [("'half", 'half'), ("'they", 'they'), ("'the", 'the'), ("'Me", 'Me'), ("'us", 'us'), ("'are", 'are'), ("'the", 'the'), ("'you", 'you')]
PTAR185003XX-V01-08-page3.txt: [("'of", 'of'), ("'ehang.", 'ehang.'), ("'to", 'to'), ("'.'eiti'the", '.eitithe'), ("'those", 'those'), ("'by", 'by'), ("'righteousness", 'righteousness')]
PTAR185003XX-V01-08-page4.txt: [("'above", 'above'), ("'One", 'One'), ('\'"', '"'), ("'was", 'was'), ("'and", 'and'), ("'the", 'the'), ("'be", 'be'), ("'can", 'can'), ("'elucidation", 'elucidation')]
PTAR185003XX-V01-08-page5.txt: [("'two", 'two'), ("'a", 'a')]
PTAR185003XX-V01-08-page6.txt: [("'Seventy", 'Seventy'), ("'of", 'of'), ("'the", 'the'), ("'A.", 'A.'), ("'or", 'or'), ("'seventieth", 'seventieth'), ("'is", 'is'), ("'chronology", 'chronology'), ("'a", 'a'), ("'or", 'or'), ("'one", 'one'), ("'of", 'of'), ("'of", 'of'), ("'then", 'then'), ("'point", 'point'), ("'why", 'why'), ("'were", 'were'), ("'until", 'until'), ("'given", 'given'), ("'but", 'but'), ("'true", 'true'), ("'chronology", 'chronology'), ("'Then", 'Then'), ("'the", 'the'), ("'the", 'the'), ("'the", 'the'), ("'commences.", 'commences.'), ("'Then", 'Then'), ("'confirmation", 'confirmation'), ("'Messiah.", 'Messiah.')]
PTAR185003XX-V01-08-page7.txt: [("'step", 'step')]
PTAR185003XX-V01-08-page8.txt: [("'TRUTH.", 'TRUTH.'), ("'AULT.", 'AULT.')]
PTAR185004XX-V01-09-page1.txt: [("'reject", 'reject')]
PTAR185004XX-V01-09-page2.txt: [("'the", 'the'), ("'message", 'message'), ("'God", 'God')]
PTAR185004XX-V01-09-page3.txt: [("''commandments", 'commandments'), ("'lived", 'lived'), ("'which", 'which'), ("'Pope", 'Pope'), ("'the", 'the')]
PTAR185004XX-V01-09-page4.txt: [("'God", 'God'), ("'patterns", 'patterns'), ("'an", 'an')]
PTAR185004XX-V01-09-page5.txt: [("'drink", 'drink')]
PTAR185004XX-V01-09-page6.txt: [("'that", 'that'), ("'his", 'his'), ("'our", 'our')]
PTAR185004XX-V01-09-page7.txt: [("'nights", 'nights'), ("'I", 'I'), ("'sacrifice", 'sacrifice')]
PTAR185005XX-V01-10-page2.txt: [("'jewels", 'jewels'), ("'months", 'months')]
PTAR185005XX-V01-10-page3.txt: [("'ten", 'ten'), ("'since", 'since')]
PTAR185005XX-V01-10-page4.txt: [("'are", 'are')]
PTAR185005XX-V01-10-page5.txt: [("'Sanctuary", 'Sanctuary')]
PTAR185005XX-V01-10-page7.txt: [("'and", 'and')]
PTAR185008XX-V01-01-page10.txt: [("'this", 'this'), ("'And", 'And'), ("'to", 'to'), ("'And", 'And')]
PTAR185008XX-V01-01-page11.txt: [("'ivork", 'ivork'), ("'it", 'it')]
PTAR185008XX-V01-01-page13.txt: [("'done", 'done'), ("'them", 'them'), ('\'should"be', 'should"be'), ("'old", 'old')]
PTAR185008XX-V01-01-page14.txt: [("'souls", 'souls')]
PTAR185008XX-V01-01-page15.txt: [("'We", 'We'), ("'forty", 'forty'), ("'by", 'by')]
PTAR185008XX-V01-01-page16.txt: [("'up", 'up'), ("'was", 'was')]
PTAR185008XX-V01-01-page2.txt: [("'to", 'to'), ("'.", '.')]
PTAR185008XX-V01-01-page3.txt: [("'each", 'each'), ("'their", 'their')]
PTAR185008XX-V01-01-page4.txt: [("'selfish", 'selfish')]
PTAR185008XX-V01-01-page6.txt: [("'was", 'was'), ("'for", 'for'), ("'I", 'I')]
PTAR185008XX-V01-01-page7.txt: [("'midnight", 'midnight'), ("'after", 'after'), ("'false", 'false')]
PTAR185008XX-V01-01-page9.txt: [("'is", 'is'), ("'answer", 'answer'), ("'Blessed", 'Blessed'), ("'to", 'to')]
PTAR185008XX-V01-02-page1.txt: [("'opening", 'opening'), ("'Upon", 'Upon'), ("'and", 'and'), ("'on", 'on'), ("'little", 'little'), ("'was", 'was'), ("'everyfthing", 'everyfthing'), ("'Then", 'Then'), ("'as", 'as'), ("'sympathies", 'sympathies')]
PTAR185008XX-V01-02-page10.txt: [("'around", 'around'), ("'they", 'they'), ("'will", 'will'), ("'All", 'All'), ("'edged", 'edged'), ("'being", 'being')]
PTAR185008XX-V01-02-page11.txt: [("'them", 'them'), ("'down", 'down')]
PTAR185008XX-V01-02-page12.txt: [("'to", 'to'), ("'There", 'There'), ("'be", 'be'), ("'of", 'of'), ("'Providence", 'Providence'), ("'vision", 'vision'), ("'in", 'in'), ("'the", 'the')]
PTAR185008XX-V01-02-page13.txt: [("'tarrying.", 'tarrying.'), ("'him", 'him')]
PTAR185008XX-V01-02-page14.txt: [("'rescripttire", 'rescripttire'), ("'question", 'question'), ("'despised.", 'despised.'), ("'fulfillment.", 'fulfillment.'), ("'were", 'were'), ("'by", 'by'), ("'for", 'for'), ("'Scipleship.", 'Scipleship.'), ("'from", 'from'), ("'if", 'if'), ("'or", 'or'), ("'all", 'all'), ("'prophecy", 'prophecy'), ("'supremacy", 'supremacy'), ("'whtCh", 'whtCh')]
PTAR185008XX-V01-02-page16.txt: [("'testimony", 'testimony'), ("'speak", 'speak'), ("'the", 'the'), ("'give", 'give'), ("'AllParts", 'AllParts'), ("'you", 'you'), ("'fastened", 'fastened'), ('\'Church."', 'Church."'), ("'a", 'a'), ("'Provil", 'Provil'), ("'Unworthy", 'Unworthy'), ("'Divine", 'Divine'), ("'speak", 'speak')]
PTAR185008XX-V01-02-page3.txt: [("'were", 'were'), ("'interwoven", 'interwoven'), ("''That", 'That'), ("'saw.", 'saw.'), ("'before", 'before'), ("'Perhaps", 'Perhaps'), ("'We", 'We'), ("'generation", 'generation'), ("'Christ", 'Christ'), ("'dear", 'dear'), ("'virgins", 'virgins'), ("'Matt..xxv", 'Matt..xxv'), ("'cry", 'cry'), ("'to", 'to'), ("'the", 'the')]
PTAR185008XX-V01-02-page4.txt: [("'or", 'or')]
PTAR185008XX-V01-02-page5.txt: [("'LAMP", 'LAMP'), ("'Many", 'Many'), ("'why", 'why'), ("'We", 'We'), ("'Strong", 'Strong'), ('\'Esdras."', 'Esdras."'), ("'they", 'they')]
PTAR185008XX-V01-02-page7.txt: [("'cannot", 'cannot'), ("'of", 'of'), ("'Gentile", 'Gentile'), ("'as", 'as'), ("'Little", 'Little'), ("'permission", 'permission'), ("'out", 'out'), ("'stupendous", 'stupendous'), ("'cold", 'cold')]
PTAR185008XX-V01-02-page8.txt: [("'record", 'record'), ("'litho", 'litho'), ("'of", 'of'), ("'with", 'with'), ("'roll", 'roll'), ("'the", 'the'), ("'we", 'we'), ("'the", 'the'), ("'If", 'If'), ("'the", 'the'), ("'and", 'and'), ("'scrap", 'scrap')]
PTAR185008XX-V01-02-page9.txt: [("'triumph", 'triumph'), ("'Providence", 'Providence'), ("'servants", 'servants')]
PTAR185009XX-V01-03-page1.txt: [("'the", 'the'), ("'of", 'of'), ('\'Word."', 'Word."')]
PTAR185009XX-V01-03-page10.txt: [("'be", 'be'), ("'falsehood", 'falsehood'), ("'single", 'single'), ("'conduct", 'conduct'), ("'Holy", 'Holy'), ("'hive", 'hive'), ("'second", 'second'), ("'asked", 'asked'), ("'motive", 'motive'), ("'perfect", 'perfect'), ("'but", 'but')]
PTAR185009XX-V01-03-page11.txt: [("'religious", 'religious'), ("'this", 'this')]
PTAR185009XX-V01-03-page12.txt: [("'understanding", 'understanding'), ("'patteins", 'patteins'), ("'Lev", 'Lev'), ("'bring", 'bring'), ("'thine", 'thine'), ("'Lord", 'Lord'), ("'the", 'the'), ("'quoted", 'quoted'), ("'the", 'the')]
PTAR185009XX-V01-03-page13.txt: [("'was", 'was')]
PTAR185009XX-V01-03-page14.txt: [("'self", 'self')]
PTAR185009XX-V01-03-page15.txt: [("'with", 'with')]
PTAR185009XX-V01-03-page16.txt: [("'future", 'future'), ("'in", 'in'), ("'the", 'the')]
PTAR185009XX-V01-03-page2.txt: [("'confirmed", 'confirmed'), ("'cannot", 'cannot')]
PTAR185009XX-V01-03-page3.txt: [("'ascend", 'ascend'), ("'They", 'They')]
PTAR185009XX-V01-03-page4.txt: [("'all", 'all'), ("'Judgment", 'Judgment'), ("'denying", 'denying')]
PTAR185009XX-V01-03-page5.txt: [("'the", 'the'), ("'after", 'after'), ("'prophecy.", 'prophecy.'), ("'These", 'These'), ("'cross", 'cross'), ("'Second", 'Second')]
PTAR185009XX-V01-03-page6.txt: [("'Hos", 'Hos')]
PTAR185009XX-V01-03-page7.txt: [("'God", 'God'), ("'coin", 'coin'), ("'falling", 'falling'), ("'trial", 'trial'), ("'scenes", 'scenes'), ("'CANNOT", 'CANNOT'), ("'Messi", 'Messi'), ("'.not", '.not'), ("'WILLING", 'WILLING'), ("'Ohrist'as", 'Ohristas'), ("'.slough", '.slough'), ("'the", 'the'), ("'have", 'have'), ("'is", 'is'), ("'Amen", 'Amen'), ("'the", 'the')]
PTAR185009XX-V01-03-page8.txt: [("'John's", 'Johns'), ("'has", 'has'), ("'of", 'of'), ("'cannot", 'cannot'), ("'Messiah.", 'Messiah.'), ("'who", 'who'), ("'wrong", 'wrong'), ("'Heaven", 'Heaven'), ("'that", 'that'), ("'being", 'being'), ("'King", 'King'), ("'right", 'right'), ("'prior", 'prior'), ("'the", 'the'), ("'hen", 'hen')]
PTAR185009XX-V01-03-page9.txt: [("'crisis", 'crisis'), ("'without", 'without'), ("'camp", 'camp'), ("'For", 'For'), ("'held", 'held'), ("'having", 'having'), ("'untoAle", 'untoAle'), ("'that", 'that'), ("'done", 'done'), ("'the", 'the'), ("'beneath", 'beneath'), ("'you.", 'you.'), ("'command", 'command'), ("'names", 'names'), ("'withgreat", 'withgreat'), ("'are", 'are')]
PTAR185009XX-V01-04-page1.txt: [("'years", 'years'), ("'shall", 'shall'), ("'could", 'could')]
PTAR185009XX-V01-04-page10.txt: [("'Holy", 'Holy')]
PTAR185009XX-V01-04-page11.txt: [("'These", 'These'), ("'the", 'the'), ("'Aaron", 'Aaron'), ("'was", 'was'), ("'the", 'the'), ("'bear", 'bear')]
PTAR185009XX-V01-04-page12.txt: [("'and", 'and'), ("'of", 'of'), ("'Upon", 'Upon')]
PTAR185009XX-V01-04-page13.txt: [("'entering", 'entering')]
PTAR185009XX-V01-04-page14.txt: [("'was", 'was'), ("'opinion", 'opinion'), ("'whether", 'whether'), ("'behind", 'behind'), ("'aplace", 'aplace')]
PTAR185009XX-V01-04-page16.txt: [("'Advent", 'Advent'), ("'the", 'the'), ("'their", 'their'), ("'N.", 'N.')]
PTAR185009XX-V01-04-page3.txt: [("'AND", 'AND'), ("'new", 'new'), ("'REIGN", 'REIGN')]
PTAR185009XX-V01-04-page4.txt: [("'the", 'the'), ("'their", 'their'), ("'His", 'His'), ("'And", 'And')]
PTAR185009XX-V01-04-page6.txt: [("'judgment", 'judgment'), ("'saying", 'saying'), ("'looking", 'looking'), ("'year", 'year'), ("'This", 'This'), ("'saying", 'saying'), ("'the", 'the'), ("'had", 'had')]
PTAR185009XX-V01-04-page7.txt: [("'We", 'We'), ("'To", 'To'), ("'have", 'have'), ("'Advent", 'Advent')]
PTAR185009XX-V01-04-page8.txt: [("'their", 'their'), ("'Be", 'Be'), ("'having", 'having'), ("'Philadelphia", 'Philadelphia'), ("'I", 'I')]
PTAR185009XX-V01-04-page9.txt: [("'foly", 'foly'), ("'which", 'which'), ("'first", 'first'), ("'Saviour", 'Saviour'), ("'it", 'it'), ("'we", 'we'), ("'he", 'he'), ("'his", 'his'), ("'ready", 'ready'), ("'believe", 'believe'), ("'of", 'of'), ("'After", 'After'), ("'And", 'And'), ("'that", 'that'), ("'he", 'he'), ("'Then", 'Then'), ("'in", 'in')]
PTAR185009XX-V01-EX-page10.txt: [("'tried", 'tried'), ("'that", 'that')]
PTAR185009XX-V01-EX-page11.txt: [("'othe.s", 'othe.s'), ("'restored", 'restored'), ("'the", 'the'), ("'dwell", 'dwell')]
PTAR185009XX-V01-EX-page12.txt: [("'hath", 'hath'), ("'willing", 'willing')]
PTAR185009XX-V01-EX-page13.txt: [("'because", 'because'), ("'and", 'and')]
PTAR185009XX-V01-EX-page14.txt: [("'the", 'the'), ('\'"', '"')]
PTAR185009XX-V01-EX-page15.txt: [("'died", 'died'), ("'according", 'according'), ("'a", 'a'), ("'the", 'the'), ("'Neither", 'Neither'), ("'law", 'law'), ("'was", 'was'), ("'declaring", 'declaring'), ("'is", 'is')]
PTAR185009XX-V01-EX-page16.txt: [("'lived", 'lived'), ("'restitution", 'restitution'), ("'the", 'the'), ("'there", 'there'), ("'the", 'the'), ("'caught", 'caught'), ("'the", 'the')]
PTAR185009XX-V01-EX-page17.txt: [("'keepcure", 'keepcure'), ("'HE", 'HE')]
PTAR185009XX-V01-EX-page2.txt: [("'gospel", 'gospel'), ("'Auditean", 'Auditean'), ("'v.", 'v.'), ("'God", 'God'), ("'most", 'most'), ("'requires", 'requires')]
PTAR185009XX-V01-EX-page3.txt: [("'and", 'and'), ("'general", 'general'), ("'tables", 'tables')]
PTAR185009XX-V01-EX-page4.txt: [("'account", 'account'), ("'before.", 'before.'), ("'urge", 'urge')]
PTAR185009XX-V01-EX-page5.txt: [("'them", 'them'), ("'roaring", 'roaring'), ("'of", 'of'), ("'See", 'See')]
PTAR185009XX-V01-EX-page6.txt: [("'moss", 'moss'), ("'raise", 'raise')]
PTAR185009XX-V01-EX-page7.txt: [("'GREAT", 'GREAT'), ("'Ahab", 'Ahab'), ("'sought", 'sought'), ("'Art", 'Art'), ("'Advent", 'Advent'), ("'of", 'of'), ("'preacher", 'preacher'), ("'exerciseth", 'exerciseth'), ("'Belau", 'Belau'), ("'to", 'to'), ("'his", 'his'), ("'be", 'be'), ("'Get", 'Get')]
PTAR185009XX-V01-EX-page8.txt: [("'to", 'to'), ("'fit", 'fit'), ("'of", 'of'), ("'altar", 'altar'), ("'WIFE.", 'WIFE.')]
PTAR185009XX-V01-EX-page9.txt: [("'roar", 'roar'), ("'And", 'And'), ("'darkening", 'darkening')]
PTAR185011XX-V01-05-page1.txt: [('\'"ittee', '"ittee'), ("'BATES.", 'BATES.'), ("'Babylon", 'Babylon'), ("'try", 'try'), ("'Jesus", 'Jesus')]
PTAR185011XX-V01-05-page2.txt: [("'calls", 'calls'), ("'voice", 'voice'), ("'given", 'given'), ("'simple", 'simple'), ("'of", 'of')]
PTAR185011XX-V01-05-page3.txt: [("'the", 'the'), ("'necessity", 'necessity'), ("'been", 'been'), ("'what", 'what')]
PTAR185011XX-V01-05-page4.txt: [("'udgment", 'udgment'), ("'city", 'city'), ("'difference", 'difference'), ("'fulfillment", 'fulfillment'), ("'heard", 'heard'), ("'seemed", 'seemed')]
PTAR185011XX-V01-05-page5.txt: [("'virgins", 'virgins'), ("'point", 'point'), ("'denied", 'denied'), ("'courage", 'courage'), ("'was", 'was'), ("'before", 'before'), ("'of", 'of'), ("'to", 'to'), ("'better", 'better'), ("'late", 'late'), ("'been", 'been')]
PTAR185011XX-V01-05-page6.txt: [("'And", 'And'), ("'Holy", 'Holy')]
PTAR185011XX-V01-05-page7.txt: [("'King", 'King'), ("'it", 'it'), ("'had", 'had')]
PTAR185011XX-V01-05-page8.txt: [("'of", 'of'), ("'mine", 'mine'), ("'from", 'from'), ("'Spirit", 'Spirit'), ("'interesting", 'interesting'), ("'thirst", 'thirst'), ("'feed", 'feed'), ("'shall", 'shall'), ("'Of'God", 'OfGod'), ("'river", 'river'), ("'from", 'from'), ("'GO.d", 'GO.d'), ("'and", 'and'), ("'the", 'the')]
PTAR185011XX-V01-11-page1.txt: [("'the", 'the')]
PTAR185011XX-V01-11-page3.txt: [("'but", 'but'), ("'how", 'how')]
PTAR185011XX-V01-11-page4.txt: [("'ever", 'ever')]
PTAR185011XX-V01-11-page5.txt: [("'BRO.", 'BRO.')]
PTAR185011XX-V01-11-page6.txt: [("'Jesus", 'Jesus'), ("'clearly", 'clearly'), ("'of", 'of'), ("'his", 'his'), ("'faces", 'faces')]
PTAR185011XX-V01-11-page7.txt: [("'that", 'that'), ("'Gentiles", 'Gentiles')]
PTAR185011XX-V01-11-page8.txt: [("'There", 'There'), ("'learned", 'learned')]
PTAR1850XXXX-VXX-XX-page10.txt: [("'BLISS", 'BLISS'), ("'is", 'is'), ("'answer", 'answer'), ("'every", 'every'), ("'Blessed", 'Blessed')]
PTAR1850XXXX-VXX-XX-page11.txt: [("'He", 'He'), ("'And", 'And')]
PTAR1850XXXX-VXX-XX-page12.txt: [("'and", 'and')]
PTAR1850XXXX-VXX-XX-page13.txt: [("'Voice", 'Voice')]
PTAR1850XXXX-VXX-XX-page16.txt: [("'Upon", 'Upon')]
PTAR1850XXXX-VXX-XX-page19.txt: [("'connection", 'connection'), ("'rifled.", 'rifled.'), ("'why", 'why'), ("'We", 'We')]
PTAR1850XXXX-VXX-XX-page23.txt: [("'did", 'did')]
PTAR1850XXXX-VXX-XX-page27.txt: [("'forget", 'forget'), ("'t", 't')]
PTAR1850XXXX-VXX-XX-page29.txt: [("'believed", 'believed')]
PTAR1850XXXX-VXX-XX-page3.txt: [("'course", 'course'), ("'GOD", 'GOD')]
PTAR1850XXXX-VXX-XX-page30.txt: [("'himself", 'himself'), ("'me", 'me'), ("'Lord", 'Lord')]
PTAR1850XXXX-VXX-XX-page31.txt: [("'heart", 'heart'), ("'meek", 'meek')]
PTAR1850XXXX-VXX-XX-page32.txt: [("'designs.", 'designs.')]
PTAR1850XXXX-VXX-XX-page34.txt: [("'atching", 'atching'), ("'the", 'the')]
PTAR1850XXXX-VXX-XX-page35.txt: [("'Covenant", 'Covenant'), ("'Amen.", 'Amen.'), ("'trust", 'trust'), ("'the", 'the')]
PTAR1850XXXX-VXX-XX-page36.txt: [("'Behold", 'Behold'), ("'present", 'present')]
PTAR1850XXXX-VXX-XX-page37.txt: [("'mai", 'mai'), ("'f'ound", 'found'), ("'Heaven", 'Heaven')]
PTAR1850XXXX-VXX-XX-page38.txt: [("'cannot", 'cannot'), ("'Devlish", 'Devlish')]
PTAR1850XXXX-VXX-XX-page39.txt: [("'unto", 'unto'), ("'that", 'that'), ("'If", 'If'), ("'Verily", 'Verily'), ("'that", 'that'), ("'The", 'The'), ("'a", 'a'), ("'Holy", 'Holy')]
PTAR1850XXXX-VXX-XX-page4.txt: [("'shall", 'shall'), ("'as", 'as'), ("'first", 'first')]
PTAR1850XXXX-VXX-XX-page40.txt: [("'Ex.", 'Ex.'), ("'was", 'was')]
PTAR1850XXXX-VXX-XX-page43.txt: [("'The", 'The')]
PTAR1850XXXX-VXX-XX-page44.txt: [("'Ver.", 'Ver.')]
PTAR1850XXXX-VXX-XX-page45.txt: [("'their", 'their')]
PTAR1850XXXX-VXX-XX-page46.txt: [("'.", '.')]
PTAR1850XXXX-VXX-XX-page48.txt: [("'place", 'place'), ("'but", 'but'), ("'door", 'door'), ("'my", 'my')]
PTAR1850XXXX-VXX-XX-page49.txt: [("'Iv", 'Iv'), ("'other", 'other')]
PTAR1850XXXX-VXX-XX-page50.txt: [("'will", 'will')]
PTAR1850XXXX-VXX-XX-page6.txt: [("'key", 'key'), ("'matter.", 'matter.'), ("'day", 'day')]
PTAR1850XXXX-VXX-XX-page7.txt: [("'were", 'were'), ("'only", 'only'), ("'Kock", 'Kock'), ("'exclaimed", 'exclaimed'), ("'will", 'will'), ("'in", 'in'), ("'Ninevah's", 'Ninevahs'), ("'he", 'he'), ("'that", 'that'), ("'kingdom", 'kingdom')]

Check Correction 4

In [23]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/PTAR/correction4

Average verified rate: 0.9628746245411058

Average of error rates: 0.03694782608695653

Total token count: 224725

In [24]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[24]:
[("'", 543),
 ('th', 161),
 ('ch', 132),
 ('d', 117),
 ('n', 79),
 ('ex', 76),
 ('ver', 75),
 ('e', 71),
 ('t', 69),
 ('x', 58),
 ('m', 54),
 ('w', 53),
 ('ment', 46),
 ("the'", 45),
 ('r', 37),
 ('tion', 35),
 ('ly', 33),
 ('f', 28),
 ('g', 27),
 ('re', 26),
 ('ments', 21),
 ("and'", 20),
 ("to'", 18),
 ('eze', 17),
 ('br', 17),
 ('ry', 17),
 ('vt', 15),
 ('nant', 13),
 ('ful', 13),
 ('tuary', 12),
 ('tions', 12),
 ('es', 12),
 ('cy', 11),
 ('un', 11)]

Correction 5 -- Rejoin Split Words

In [25]:
# %load shared_elements/rejoin_split_words.py
prev = cycle
cycle = "correction5"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)
    
    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    errors = reports.identify_errors(tokens, spelling_dictionary)

    replacements = clean.check_if_stem(errors, spelling_dictionary, tokens, get_prior=False)
    
    if len(replacements) > 0:
        print('{}: {}'.format(filename, replacements))

        for replacement in replacements:
            content = clean.replace_split_words(replacement, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
PTAR184907XX-V01-01-page5.txt: [('COV', 'ENANT')]
PTAR184908XX-V01-02-page8.txt: [('pre', 'cious')]
PTAR184908XX-V01-03-page2.txt: [('LOV', 'ED')]
PTAR184908XX-V01-03-page8.txt: [('th', 'at')]
PTAR184909XX-V01-04-page3.txt: [('DISPER', 'SIONS')]
PTAR184912XX-V01-05-page1.txt: [('IMMORTALI', 'TY'), ('IMMOR', 'TALITY'), ('th', 'em')]
PTAR184912XX-V01-05-page2.txt: [('JUSTIFI', 'CATION'), ('inte', 'rest')]
PTAR184912XX-V01-05-page6.txt: [('th', 'e')]
PTAR184912XX-V01-05-page8.txt: [('TR', 'UTH')]
PTAR184912XX-V01-06-page2.txt: [('TABER', 'NACLE')]
PTAR184912XX-V01-06-page7.txt: [('eigh', 't')]
PTAR185003XX-V01-07-page4.txt: [('TA', 'BLES')]
PTAR185003XX-V01-07-page5.txt: [('Tes', 'tament')]
PTAR185003XX-V01-07-page6.txt: [('fal', 'len')]
PTAR185003XX-V01-07-page7.txt: [('em', 'braced')]
PTAR185003XX-V01-08-page1.txt: [('noth', 'ing')]
PTAR185003XX-V01-08-page3.txt: [('re', 'main')]
PTAR185003XX-V01-08-page4.txt: [('sanctua', 'ry')]
PTAR185003XX-V01-08-page5.txt: [('HEAV', 'ENLY'), ('circum', 'stances')]
PTAR185003XX-V01-08-page7.txt: [('PEO', 'PLE'), ('PLE', 'A')]
PTAR185004XX-V01-09-page2.txt: [('MESSA', 'GE')]
PTAR185004XX-V01-09-page5.txt: [('Ex', 'tra')]
PTAR185005XX-V01-10-page1.txt: [('pre', 'pare')]
PTAR185005XX-V01-10-page2.txt: [('ta', 'Me')]
PTAR185005XX-V01-10-page4.txt: [('IMA', 'GES'), ('HEAV', 'ENS')]
PTAR185008XX-V01-01-page11.txt: [('Mor', 'iah')]
PTAR185008XX-V01-01-page13.txt: [('TA', 'N')]
PTAR185008XX-V01-01-page16.txt: [('co', 'worker')]
PTAR185008XX-V01-01-page2.txt: [('oc', 'H')]
PTAR185008XX-V01-01-page6.txt: [('re', 'SEARCHING')]
PTAR185008XX-V01-01-page8.txt: [('EX', 'PECT')]
PTAR185008XX-V01-01-page9.txt: [('th', 'or'), ('ment', 'on')]
PTAR185008XX-V01-02-page1.txt: [('Vo', 'L'), ('saTsu', 'MA')]
PTAR185008XX-V01-02-page10.txt: [('unim', 'portant')]
PTAR185008XX-V01-02-page14.txt: [('re', 'adjust'), ('confe', 'sSing'), ('confes', 's'), ('concep', 'tion')]
PTAR185008XX-V01-02-page16.txt: [('Ged', 'or')]
PTAR185008XX-V01-02-page5.txt: [('coun', 't')]
PTAR185008XX-V01-02-page7.txt: [("ham'", 's')]
PTAR185009XX-V01-03-page11.txt: [('blasphe', 'ming')]
PTAR185009XX-V01-03-page15.txt: [('od', 'is')]
PTAR185009XX-V01-03-page2.txt: [('un', 'impaired')]
PTAR185009XX-V01-03-page6.txt: [('un', 'It')]
PTAR185009XX-V01-03-page7.txt: [("Christ'", 's')]
PTAR185009XX-V01-04-page15.txt: [('th', 'At'), ('al', 'ways')]
PTAR185009XX-V01-04-page6.txt: [('re', 'arouse')]
PTAR185009XX-V01-04-page7.txt: [('re', 'examining'), ('th', 'e')]
PTAR185009XX-V01-04-page9.txt: [('zi', 'n')]
PTAR185009XX-V01-EX-page11.txt: [('th', 'y'), ('Itt', 'a')]
PTAR185009XX-V01-EX-page2.txt: [('unnecessa', 'ry'), ('fa', 'th')]
PTAR185009XX-V01-EX-page5.txt: [('Egy', 'ptians')]
PTAR185011XX-V01-05-page1.txt: [('th', 'at')]
PTAR185011XX-V01-11-page3.txt: [('OB', 'SERVED')]
PTAR185011XX-V01-11-page4.txt: [('sus', 'I')]
PTAR185011XX-V01-11-page5.txt: [('PR', 'ES'), ('th', 'at')]
PTAR1850XXXX-VXX-XX-page10.txt: [('th', 'or'), ('ment', 'on')]
PTAR1850XXXX-VXX-XX-page13.txt: [('Cincin', 'nati')]
PTAR1850XXXX-VXX-XX-page18.txt: [('carryin', 'g')]
PTAR1850XXXX-VXX-XX-page24.txt: [('tr', 'ek'), ('Ers', 'E')]
PTAR1850XXXX-VXX-XX-page25.txt: [('plen', 'ty'), ('obei', 'sance')]
PTAR1850XXXX-VXX-XX-page28.txt: [('re', 'adjust')]
PTAR1850XXXX-VXX-XX-page3.txt: [('giv', 'en')]
PTAR1850XXXX-VXX-XX-page31.txt: [('co', 'exist')]
PTAR1850XXXX-VXX-XX-page35.txt: [('un', 'It')]
PTAR1850XXXX-VXX-XX-page37.txt: [('re', 'I')]
PTAR1850XXXX-VXX-XX-page38.txt: [('FAI', 'L')]
PTAR1850XXXX-VXX-XX-page43.txt: [('ex', 'press')]
PTAR1850XXXX-VXX-XX-page46.txt: [('TI', 's')]
PTAR1850XXXX-VXX-XX-page50.txt: [('th', 'At'), ('al', 'ways')]
PTAR1850XXXX-VXX-XX-page51.txt: [('RE', 'In')]
PTAR1850XXXX-VXX-XX-page7.txt: [('re', 'SEARCHING'), ('cer', 'tainty')]

Check Correction 5

In [26]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/PTAR/correction5

Average verified rate: 0.9632425599344794

Average of error rates: 0.03657391304347826

Total token count: 224662

In [27]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[27]:
[("'", 543),
 ('th', 155),
 ('ch', 132),
 ('d', 117),
 ('n', 78),
 ('ver', 75),
 ('ex', 73),
 ('e', 68),
 ('t', 67),
 ('x', 58),
 ('m', 54),
 ('w', 53),
 ("the'", 45),
 ('ment', 44),
 ('r', 37),
 ('tion', 34),
 ('ly', 33),
 ('f', 28),
 ('g', 27),
 ('ments', 21),
 ("and'", 20),
 ('re', 18),
 ("to'", 18),
 ('eze', 17),
 ('br', 17),
 ('ry', 17),
 ('vt', 15),
 ('nant', 13),
 ('ful', 13),
 ('tuary', 12),
 ('tions', 12),
 ('cy', 11),
 ('es', 11)]

Correction 6 -- Rejoin Split Words II

In [28]:
# %load shared_elements/rejoin_split_words.py
prev = cycle
cycle = "correction6"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)
    
    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    errors = reports.identify_errors(tokens, spelling_dictionary)

    replacements = clean.check_if_stem(errors, spelling_dictionary, tokens, get_prior=True)
    
    if len(replacements) > 0:
        print('{}: {}'.format(filename, replacements))

        for replacement in replacements:
            content = clean.replace_split_words(replacement, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
PTAR184907XX-V01-01-page3.txt: [('PER', 'PETUAL'), ('COM', 'MANDMENTS')]
PTAR184907XX-V01-01-page4.txt: [('cove', 'nant'), ('COVE', 'NANT')]
PTAR184907XX-V01-01-page5.txt: [('COMMAND', 'MENTS')]
PTAR184908XX-V01-02-page3.txt: [('MIN', 'ISTRATION')]
PTAR184908XX-V01-02-page7.txt: [('LAW', 'FUL')]
PTAR184908XX-V01-02-page8.txt: [('pre', 'cious')]
PTAR184908XX-V01-03-page4.txt: [('COM', 'MANDMENTS')]
PTAR184912XX-V01-05-page1.txt: [('th', 'em')]
PTAR184912XX-V01-05-page2.txt: [('constant', 'ly')]
PTAR184912XX-V01-05-page8.txt: [('TR', 'UTH')]
PTAR185003XX-V01-07-page2.txt: [('COM', 'MANDMENTS')]
PTAR185003XX-V01-07-page3.txt: [('sin', 'gular')]
PTAR185003XX-V01-07-page4.txt: [('TA', 'BLES'), ('cove', 'nant'), ('COMMAND', 'MENTS')]
PTAR185003XX-V01-07-page6.txt: [('COMMAND', 'MENTS')]
PTAR185003XX-V01-07-page7.txt: [('Minis', 'tration'), ('MINIS', 'TRATION')]
PTAR185003XX-V01-08-page3.txt: [('govern', 'ment')]
PTAR185003XX-V01-08-page4.txt: [('sanctua', 'ry'), ('apart', 'ment')]
PTAR185003XX-V01-08-page7.txt: [('con', 'fidence')]
PTAR185004XX-V01-09-page7.txt: [('do', 'minical')]
PTAR185005XX-V01-10-page4.txt: [('A', 'fter')]
PTAR185005XX-V01-10-page7.txt: [('At', 'tains')]
PTAR185008XX-V01-01-page9.txt: [('the', 're')]
PTAR185008XX-V01-02-page1.txt: [('saTsu', 'MA'), ('ben', 'eath')]
PTAR185008XX-V01-02-page10.txt: [('unim', 'portant')]
PTAR185008XX-V01-02-page11.txt: [('s', 'ated')]
PTAR185008XX-V01-02-page6.txt: [('so', 'dding')]
PTAR185009XX-V01-03-page14.txt: [('of', 'fering')]
PTAR185009XX-V01-03-page7.txt: [('reason', 'ers'), ('do', 'ubt'), ('Chris', 'tian')]
PTAR185009XX-V01-04-page1.txt: [('JUDG', 'MENT')]
PTAR185009XX-V01-04-page14.txt: [('a', 'sa')]
PTAR185009XX-V01-04-page6.txt: [('and', 're')]
PTAR185009XX-V01-EX-page11.txt: [('COMMAND', 'MENTS')]
PTAR185009XX-V01-EX-page13.txt: [('and', 're'), ('gath', 'ering'), ('WON', 'DERS')]
PTAR185009XX-V01-EX-page16.txt: [('the', 're')]
PTAR185009XX-V01-EX-page2.txt: [('AND', 'RE'), ('c', 'haracter'), ('back', 'slidings'), ('con', 'fidence')]
PTAR185009XX-V01-EX-page3.txt: [('com', 'mandments')]
PTAR185009XX-V01-EX-page5.txt: [('time', 'ly'), ('no', 'es')]
PTAR185009XX-V01-EX-page7.txt: [('d', 'estined')]
PTAR185011XX-V01-05-page2.txt: [('united', 'ly')]
PTAR185011XX-V01-05-page7.txt: [('set', 'tled')]
PTAR185011XX-V01-11-page1.txt: [('P', 'UB')]
PTAR185011XX-V01-11-page3.txt: [('maids', 'ervant')]
PTAR185011XX-V01-11-page4.txt: [('I', 're'), ('a', 'cer')]
PTAR185011XX-V01-11-page7.txt: [('lite', 'ral')]
PTAR185011XX-V01-11-page8.txt: [('love', 'ly'), ('dis', 'ciples')]
PTAR1850XXXX-VXX-XX-page13.txt: [('Cincin', 'nati')]
PTAR1850XXXX-VXX-XX-page2.txt: [('m', 'itt')]
PTAR1850XXXX-VXX-XX-page25.txt: [('obei', 'sance')]
PTAR1850XXXX-VXX-XX-page33.txt: [('do', 'ubtful')]
PTAR1850XXXX-VXX-XX-page39.txt: [('b', 'eep')]
PTAR1850XXXX-VXX-XX-page43.txt: [('ha', 'th'), ('of', 'fering')]
PTAR1850XXXX-VXX-XX-page45.txt: [('reconcile', 'ment')]
PTAR1850XXXX-VXX-XX-page49.txt: [('a', 'sa')]
PTAR1850XXXX-VXX-XX-page5.txt: [('be', 'ri')]
PTAR1850XXXX-VXX-XX-page7.txt: [('cer', 'tainty'), ('a', 'cer')]

Check Correction 6

In [29]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/PTAR/correction6

Average verified rate: 0.9634385240062687

Average of error rates: 0.03627391304347827

Total token count: 224608

In [30]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[30]:
[("'", 543),
 ('th', 154),
 ('ch', 132),
 ('d', 116),
 ('n', 78),
 ('ver', 75),
 ('ex', 73),
 ('e', 68),
 ('t', 67),
 ('x', 58),
 ('m', 53),
 ('w', 53),
 ("the'", 45),
 ('ment', 43),
 ('r', 37),
 ('tion', 34),
 ('ly', 31),
 ('f', 28),
 ('g', 27),
 ("and'", 20),
 ("to'", 18),
 ('eze', 17),
 ('ments', 17),
 ('br', 17),
 ('ry', 17),
 ('vt', 15),
 ('re', 13),
 ('nant', 12),
 ('ful', 12),
 ('tuary', 12),
 ('tions', 12),
 ('cy', 11)]

Review Remaining Errors

In [31]:
reports.docs_with_high_error_rate(summary)
Out[31]:
[]
In [32]:
reports.long_errors(errors_summary)
Out[32]:
(['holichasten',
  'entreatfellow',
  'the-commandments',
  'whicircontained',
  'unrighteousimpressions',
  'commandinents',
  'couragement',
  'scriptuthou',
  "discovered'",
  'resurection',
  'hearejected',
  'incorparating',
  "instituted'",
  'fulfilltnent',
  'contrabecause',
  'constantorder',
  'blasphemeia',
  'wateradrenehed',
  'thelanguage',
  'experiencwithout',
  'acqaintance',
  "thebeliever's",
  'opinequally',
  'aldisturbance',
  'peoclaiming',
  'theirjudgment',
  'constantmeeting',
  'accomparadox',
  'wasitttreduced',
  'drunken-harlot',
  "earth'-wherein",
  'forguidance',
  'immeliately',
  "prophecies'",
  'sciiptiires',
  'hvbelieving',
  'ahaortlained',
  'occacross-bearing',
  'commencepectations',
  "warclownie'r",
  "adventists'",
  'expericially',
  'congregution',
  'indionation',
  'everlusting',
  'and-expected',
  'seventlimonth',
  'positivelydeciared',
  'howgdaiamthat',
  'wriversiort',
  'sabbath-day',
  'christiansin',
  'imporshould',
  'deliverence',
  'forgotstruction',
  'bescattering',
  'testimocomprehend',
  'atonesanctuary',
  'meneorialwhich',
  'isvautitagy',
  'thereloreit',
  'chastenings',
  "eustachius's",
  "the'atonement",
  "became'acquainted",
  "we'establish",
  'coneersions',
  'knovvleclgp',
  'everyfthing',
  'congrenakedness',
  'ofunfaithful',
  'commandmentmust',
  'truitettutiearro',
  'semi-monthly',
  'testamentorder',
  'rhinistratian',
  'connectedeand',
  'confotinded',
  'opirtienethat',
  'proclafused',
  'rtghteousness',
  'hand-writing',
  'manifesttake',
  'obstapressly',
  'semi-monthly---by',
  "september'and",
  "calculated'",
  'thetseventhday',
  'romangenerid',
  'hovedensays',
  'leavitigtheir',
  'hascorrupted',
  'abominationsof',
  'teneommandments',
  'thattheangel',
  'inpreparation',
  'noonewhohasreadthe',
  'inforination',
  "creition't'",
  'inlookingiorthis',
  'mesmerfathers',
  'fustlighton',
  'politicoreligious',
  'avoidfinally',
  'purunbelievers',
  'knowledgeto',
  'exereiselts',
  'fairhatution',
  "durable'ric",
  'badacheareth',
  'tlieseventh',
  'blasphemeus',
  'lhtetiodhogitnacal',
  'evensingular',
  'covegreater',
  "'afterwards'",
  'sinoffering',
  'commandmentd',
  "themselves'",
  'chitshowing',
  'father-inprepared',
  'ransoineeloia',
  'wonder-working',
  'irresistable',
  'inconceivprofessed',
  'freighttures',
  'infrodaction',
  "ofswine'stlesb",
  'disorgapizers',
  'fulfilvictory',
  'thenewtestament',
  'interferende',
  'fhultsandsins',
  'prefeelings',
  'the-earthly',
  'twoinsthutions',
  'incessantbath',
  'intercedthen',
  'assemperished',
  "jerusalein's",
  'accuratainment',
  'prereflecting',
  'inprovidence',
  'underheavenly',
  'rescripttire',
  'thepreaence',
  'terproclamation',
  'nsvowientheir',
  'felloviship',
  'certainlyis',
  'finishincorporated',
  "sabbathday's",
  'utterwinter',
  'jointheirship',
  'expreslamps',
  'reviewshould',
  'knowlothers',
  'tmmediately',
  'thereuntoperfect',
  'maintaincifies',
  'gabrietrefers',
  'neweovettant',
  'twoministrations',
  'wiroverturn',
  'therresurre',
  'eighthehapters',
  'breast-plate',
  'fergirtness',
  "tuary'comprehened",
  'somejoyment',
  'therrianner',
  'sancenclosed',
  'soul-thrilling',
  'histbriclestimony',
  'deprofessedly',
  'offeringsffoorr',
  'scripfessing',
  'ciiiitetlit',
  'proviprophetic',
  'somehonestly',
  'theoparliai',
  'sacenelosed',
  'cominandments',
  'fififiliment',
  'atprophetic',
  'arscripture',
  "passengers'",
  'sabbathdays',
  'frornthataifs',
  'continstood',
  'commiandnient',
  'cowmenquence',
  'developement',
  'censtired-and',
  'greatandnotabledayefthe',
  'perfectioifilment',
  'nectionbetween',
  'eommandnients',
  "other'errors",
  'propheivord',
  'congregamake',
  'trailsgressions',
  'hzitokingbbantidtlatrtitvet',
  'trespassoffering',
  'publishment',
  'preparatoryscenes',
  "t'tshayts'err",
  'shouhclread',
  "behoovecl'him",
  'tresspassed',
  'coniktetion',
  'ccunmaridnients',
  'atonepresent',
  'sabbathbreaking',
  'affecslumbers',
  'preciliation',
  'cronologers',
  'perioditire',
  'unconqueraing',
  'commandlation',
  'tellsfisilial',
  'witnessedthe',
  'thefoundation',
  'cornmandments',
  'spperstition',
  'anftrttering',
  'paperssince',
  'sunplesiewr',
  'slaveholding',
  'sincommentators',
  'setipttires',
  'no-commandment',
  'conflagracorrupted',
  'cdritmencententat',
  'putrifaction',
  'allimportant',
  "angers'message",
  'anotheresteemeth',
  'eityroafill',
  'protabernacle',
  'xpeciatiens',
  'righteousto',
  'fulfillxxiv',
  "declaration'",
  'concluinfluence',
  'unavoidaing',
  'onlyapparent',
  'aftefchrist',
  'nearlyevery',
  'councounterfeit',
  'terminatwave',
  'thottirenant',
  'ourpeetings',
  'congrecrati',
  'thrgiveness',
  'thedoetrine',
  'infidelstleny',
  'acknowledgeand',
  'meetwhatever',
  'importurned',
  'admonisealing',
  'scripresurrection',
  'concepthemselves',
  'spirrepresented',
  'convincedthat',
  'guidseasons',
  'fruitofthearticles',
  '---language',
  'onuonizavnt',
  'pronotinced',
  'thenecessity',
  "my'suffering",
  'eammandment',
  'orcongregation',
  'neartherighttime',
  "exciting'oecasion",
  'calculatiens',
  'mahammedans',
  'conaccomplishing',
  'uncircumeision',
  'neliuzhadnezzar',
  'thopeculiai',
  'meetsolemhly',
  'confidencein',
  'extravigances',
  'appropripassage',
  'ectridttatesrien',
  'uninformation',
  'disappomted',
  'revelerstand',
  'catareceive',
  'fulmovements',
  'considerfrom',
  "proclaimed'",
  'offerplaces',
  'bejustified',
  'ifinnediately',
  'philadelchildren',
  'thatsitteth',
  'sixteenthehapter',
  'midnightcry',
  "separavah's",
  'colsoiation',
  'yourconsecration',
  'univittingly',
  'fountairrof',
  'munsonville',
  'soul-purifying',
  'dispeisation',
  'publicaprepared',
  'illusguests',
  'iswerrettaired',
  'bridedisappointed',
  'proviexistence',
  'tnisconceived',
  'chromildgycif',
  'ascertainterpositions',
  "representing'",
  'pbrasenlogy',
  'preachevent',
  'thoughtseine',
  'direcdinibted',
  'apfurnishes',
  'cliurchl-that',
  'rightmanifest',
  'haveesreriep',
  'anomplished',
  'thoroughlyhealed',
  'bedeliberate',
  'instrticties',
  'rerepresent',
  'especiallythose',
  'testifytutto',
  'preparatoing',
  'reproach-of',
  'forlittoith',
  'rentainethi',
  'comtnencing',
  'tribulatiom',
  'plagueamong',
  'forgivenesi',
  'congregatiou',
  'ratifollowing',
  'trtbuicition',
  'movetherefore',
  'symbolicallyderusalem',
  'philadelphi',
  'medo-persia',
  'ishmaelitish',
  'terminatein',
  'incorporatedit',
  'self-deceived',
  "his'brethren",
  'derangewhich',
  'forgiveignorance',
  'commadments',
  "revelation'",
  'agitationand',
  'accomplishthose',
  'sin-offerings',
  "abandon'our",
  'conlingering',
  'ourbackelidings',
  'evidentpossible',
  'atonecording',
  'oomniandmettet',
  'rootnedoffspring',
  'understandpromises',
  'soul-destroying',
  'messageswere',
  'compreheaded',
  'towardsthem',
  'specifiviously',
  'confiverted',
  'typiidolators',
  'becausethese',
  'notnetiftei',
  'coreetterrox',
  'eveningsacrifice',
  'thatieventy',
  'withmetehhuunnttupbio',
  'precludethe',
  'break-plate',
  'somepointment',
  'faappointnient',
  'thatthoumayest',
  'decreclaimed',
  'retraditions',
  'still-rests',
  'regumarriage',
  'fulfillrested',
  "by'succession",
  'comtnandinent',
  'manministry',
  'initiseventh',
  'minusustain',
  'pecessaryto',
  'fallibilithese',
  "be'assigned",
  'acceptatrodden',
  'intheforenoon',
  'scape-goats',
  'differoneebetween',
  'offerthousand',
  'positiveinstitution',
  'demonstraentered',
  'comfaithful',
  'consistedin',
  'institutioni',
  'astonishthe',
  'oththerefitie',
  'improvethent',
  "associated'",
  'seeinhabitants',
  "'alstachius",
  'tarmovement',
  'mornprepared',
  "bampfield's",
  'blinduessin',
  'coivimandments',
  'spruce-street',
  'christenthe',
  'disasterons',
  'desolatidns',
  'archeumstances',
  "selves'shall",
  'meetcreased',
  'readprayers',
  'accuradence',
  'experiencso',
  'confidenttion',
  "oursaviour's",
  'forgivenesslif',
  'sevcaptivity',
  "the'question",
  'encourainfluence',
  'sublunagroom',
  'whichportance',
  "chronolog'ers",
  'demenstrates',
  'enragacceptance',
  'beenrepresented',
  'fnlfidinent',
  'thelridegroom',
  'comreconcile',
  'dominhistory',
  'personagesappear',
  'comtgandlitents',
  'thasacritice',
  "and'reminds",
  'cleantermination',
  'commandhave',
  'greathurden',
  'aeiseveration',
  'serighteous',
  'hisposition',
  'presumptous',
  'ecelesiastical',
  'everblessfoes',
  'thispassageicor',
  "confirmarrangements'",
  'therestitution',
  'nebitchadneasar',
  'belongingito',
  'fulfillnient',
  'distingnished',
  'consequenge',
  'passconfessed',
  'deceivedland',
  'deaexpressions',
  'thoumystery',
  'awakenrubbish',
  'jeconverted',
  'engngetiness',
  'mernconkssed',
  'expectrimmed',
  'revelationk',
  'tirtisatenol',
  'commandrhents',
  'circumstanble',
  'subsectnent',
  'provfarther',
  "ou'rblessed",
  'itevidently',
  'prevideneenever',
  'saerifising',
  'communiprep',
  'ofreligious',
  'wholesystem',
  'image-beast',
  "s'uappcolsed",
  'their-iniquities',
  'reconciliapass',
  'unchangable',
  'threatnings',
  "found'within",
  'ndperfectfulfilment',
  'paramunications',
  'eterlsatnddyi',
  'combiaiadments',
  "irkreasing'",
  'sancbeareth',
  'thesanctuary',
  'wildernessof',
  'colsolation',
  'adventtause',
  'wicked-shall',
  "andhavingha'dthemwashedawayha",
  'mentionwhich',
  'cleanssomething',
  'yourassertion',
  'thefulfillment',
  'exarriining',
  'counterfeitprovidence',
  'performinot',
  'sabbathkeeping',
  'providencebe',
  'witdescribed',
  'translasabbath',
  'prophecyand',
  'destructiou',
  'cornquestion',
  "close'eonnection",
  'hauglitiness',
  'righteoussaid',
  'pprelielicied',
  'everyvestige',
  'confessionof',
  'thescriptures',
  'thatbelieve',
  'tiiinacqeas',
  'back-slidden',
  'baknowledge',
  'troubviilttake',
  'countersalvation',
  'derangecovenant',
  'clamorabout',
  'seventhchapter',
  'dorninicalday',
  'eontemtranslator',
  'soul-stirring',
  'repronounce',
  'evidencethat-he',
  'instituticin',
  'theininistration',
  'parighteous',
  'transgressorsunder',
  'whichtrodden',
  'faithfulwitness',
  'jerusawrought',
  'commandmente',
  'sublunairksome',
  'creawitnessed',
  'desirableday',
  'iriterferred',
  "endeavoring'",
  'throtighlim',
  'rejectedthis',
  "conviction'",
  'oodpromised',
  'ailwasidsome',
  'scrippeople',
  'badtheeffect',
  'thetnselves',
  'abbathintothefirst',
  'circumcisedin',
  'aldiscovered',
  'importinfinthefigfiliment',
  'disapbelievers',
  'resurreetion',
  'beth-shemesh',
  'isolatevdent',
  'prejlidices',
  'orsalvation',
  'fuladvoeated',
  'procamation',
  'tdeincrgenses',
  'ceritreport',
  'preparationt',
  'cargiveness',
  'commandmesntof',
  'prodoctrine',
  'blessidentifid',
  'backfulfilled',
  'knowlcross-bearing',
  'theroselies',
  'inconceivagainst',
  'ancleentinued',
  'circumstanthe',
  'thronestast',
  'terminatioa',
  'testimoniesin',
  'wasnailekto',
  "expressive'",
  'interpositon',
  'startingspointot',
  "vather'airight",
  'englandyniton',
  'ungressions',
  'sabanointed',
  "jesus'lovely",
  'assuranceis',
  'sanetifieth',
  'crucifixipti',
  'thatinoment',
  'wilderblood',
  'occaministry',
  'shuntpossible',
  'teachconceptions',
  'scapederful',
  'sublime-thought',
  'sotereignty',
  'offerpowerful',
  'predicttime',
  'saebalthrsalelitwactstruspr',
  'fifty-eighth',
  'recencilitheir',
  "macknight's",
  'andrespectable',
  'nterethiaestbe',
  'attendwithout',
  'theabominations',
  'steppinginto',
  'particubath',
  'commandnient',
  'choinhabitants',
  'sanctuaryof',
  'descendeorrune',
  'tetclielden',
  'revealdence',
  'fiemembranze',
  'wellthatthey',
  'burnwaiting',
  'definitetime',
  'witthsrised',
  'andijeliverance',
  'tistisriony',
  'righteousor',
  'ofcultivation',
  'stitchregard',
  'pereconciled',
  'disaolution',
  'merchrndise',
  'fontainbletm',
  'iitterifice',
  'guidthelonb',
  'cornforfing',
  'fearlesssummation',
  "his'daughter",
  'havingrejected',
  'tiventyrfive',
  'gyochtottize',
  'nebuchadnezar',
  'building--many',
  'backalidings',
  'eraphatithe',
  'ordinancesof',
  'arititypical',
  'wouldanswer',
  'denyingthat',
  'siinplicity',
  'theinterpretation',
  'humilliation',
  'motradiction',
  "shalt'bring",
  'pretendthat',
  'ourdisappointthent',
  'abankindred',
  'scatteringt',
  'thurnmimlight',
  'pheiscourtette',
  'ceintaining',
  'inteligence',
  'thbeitiannhato',
  'commandmentand',
  'intersessor',
  'desoliation',
  'confidentbe',
  'jesuslaught',
  'fulfilledat',
  'hventreyiew',
  'immormoment',
  'aupperaarepared',
  'propiiesyings',
  'christjetoil',
  'crucifixstill',
  'sabanimosities',
  'censurlieve',
  "description'cam",
  'ittandmenti',
  'righteousues',
  'expomodesty',
  'miscontending',
  'publithroughout',
  'anchshoutings',
  'ourdeatlaike',
  'pourtuguese',
  'ethinpnaocr',
  'reignoffered',
  'objectionis',
  'amuiriciacould',
  'causrighteous',
  'revealedywill',
  'resurrdction',
  'presanctifying',
  'retnrnedhome',
  'magdeburgenses',
  'simplicityof',
  'idolwiththe',
  "definition'",
  'hiscleansing',
  'laocliceans',
  'tabernatuary',
  'ofjudiement',
  'thispromise',
  'perfecttabernacle',
  "god'sblessing",
  "will'require",
  'undoubtedthe',
  'examiningthe',
  'justifihath',
  "tabernaand'",
  'abolishished',
  "doubt'after",
  'stumbling-blocks',
  'reprophetic',
  'fulfillrhent',
  'aposaccustomed',
  'accordagencies',
  'desirthough',
  'praccurrence',
  'minietration',
  'judgfurniture',
  'interferred',
  'standingcorn',
  "cvrtaiuly'it",
  'chronologers',
  'proclaimthemselves',
  'counterfeitand',
  'forlteeping',
  'asupplement',
  'wordfortheniselves',
  'ournaistake',
  'theyunderstand',
  'therumbling',
  'conprophecies',
  'theytweltteiohuenuerioint',
  'timeiinstead',
  'conelusionof',
  'decentlyand',
  'reconipense',
  'tarryrefine',
  'confessiontehacksliddn',
  'commandedto',
  'fearlesshas',
  'adsomething',
  "whereged'speople",
  'whethagainst',
  'meetmerchants',
  'solethnized',
  "professed't",
  'ministrationof',
  'applyingthese',
  "reedived'froin",
  'lordpalsied',
  'distracttturn',
  'destruilion',
  'seientlfday'],
 10)

Correction 7 -- Separate Squashed Words

In [33]:
# %load shared_elements/separate_squashed_words.py
import pandas as pd
from math import log

prev = cycle
cycle = "correction7"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

verified_tokens = []

for filename in corpus:  
    content = utilities.readfile(directories['prev'], filename)
    clean.get_approved_tokens(content, spelling_dictionary, verified_tokens)

tokens_with_freq = dict(collections.Counter(verified_tokens))
words = pd.DataFrame(list(tokens_with_freq.items()), columns=['token','freq'])
words_sorted = words.sort_values('freq', ascending=False)
words_sorted_short = words_sorted[words_sorted.freq > 2]

sorted_list_of_words = list(words_sorted_short['token'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    text = utilities.strip_punct(content)
    tokens = utilities.tokenize_text(text)
    
    wordcost = dict((k, log((i+1)*log(len(sorted_list_of_words)))) for i,k in enumerate(sorted_list_of_words))
    maxword = max(len(x) for x in sorted_list_of_words)
    
    replacements = []
    
    for token in tokens:
        if not token.lower() in spelling_dictionary:
            if len(token) > 15:
                if re.search(r"[\-\-\'\"]", token):
                    pass
                else:
                    split_string = clean.infer_spaces(token, wordcost, maxword)
                    list_split_string = split_string.split()
                    
                    if clean.verify_split_string(list_split_string, spelling_dictionary):
                        replacements.append((token, split_string))
                    else:
                        pass
            else:
                pass
        else:
            pass
        
    if len(replacements) > 0:
        print("{}: {}".format(filename, replacements))
        
        for replacement in replacements:
            content = clean.replace_pair(replacement, content)
    
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
PTAR184909XX-V01-04-page6.txt: [('positiveinstitution', 'positive institution')]
PTAR184912XX-V01-05-page1.txt: [('transgressorSunder', 'transgressor S under')]
PTAR184912XX-V01-05-page2.txt: [('thoroughlyhealed', 'thoroughly healed')]
PTAR184912XX-V01-05-page3.txt: [('theinterpretation', 'the interpretation')]
PTAR184912XX-V01-05-page8.txt: [('greatandnotabledayefthe', 'great and not a b led a ye f t h e')]
PTAR185003XX-V01-07-page6.txt: [('Wordfortheniselves', 'Word for then i selves'), ('anotheresteemeth', 'another esteemeth')]
PTAR185003XX-V01-07-page7.txt: [('positivelydecIared', 'positively de c I a red'), ('twoministrations', 'two ministration s'), ('theininistration', 'the in in i st r a t i o n')]
PTAR185003XX-V01-08-page3.txt: [('personagesappear', 'person ages appear')]
PTAR185003XX-V01-08-page4.txt: [('cleantermination', 'clean termination')]
PTAR185003XX-V01-08-page6.txt: [('startingspointot', 'start ing s point o t')]
PTAR185003XX-V01-08-page7.txt: [('perfectioifilment', 'perfect i o if i l men t')]
PTAR185004XX-V01-09-page7.txt: [('abbathintothefirst', 'a b bath into the first')]
PTAR185005XX-V01-10-page3.txt: [('Noonewhohasreadthe', 'No one who has read the')]
PTAR185008XX-V01-01-page3.txt: [('commencepectations', 'commence p e c t a t i o n s')]
PTAR185008XX-V01-01-page5.txt: [('conaccomplishing', 'con accomplishing')]
PTAR185008XX-V01-02-page14.txt: [('ourdiSappointthent', 'our di S a p point then t')]
PTAR185008XX-V01-02-page16.txt: [('preparatoryscenes', 'preparatory scenes')]
PTAR185008XX-V01-02-page3.txt: [('Bridedisappointed', 'Bride disappointed'), ('inlookingiorthis', 'in looking i or this')]
PTAR185008XX-V01-02-page9.txt: [('countersalvation', 'counter salvation')]
PTAR185009XX-V01-03-page11.txt: [('politicoreligious', 'p o lit i c o religious')]
PTAR185009XX-V01-03-page13.txt: [('perfecttabernacle', 'perfect tabernacle')]
PTAR185009XX-V01-03-page4.txt: [('inconceivprofessed', 'in c once iv professed')]
PTAR185009XX-V01-03-page9.txt: [('yourconsecration', 'your consecration'), ('histbriclestiMony', 'hi st b r i c l e s t i M o n y')]
PTAR185009XX-V01-04-page8.txt: [('Philadelchildren', 'P hi l ad el children')]
PTAR185009XX-V01-04-page9.txt: [('trespassoffering', 'trespass offering')]
PTAR185009XX-V01-EX-page10.txt: [('LhtetiodhogitnAcal', 'L h t e t i o d h o g i t n A c a l')]
PTAR185009XX-V01-EX-page13.txt: [('proclaimthemselves', 'proclaim themselves')]
PTAR185009XX-V01-EX-page8.txt: [('eveningsacrifice', 'evening sacrifice')]
PTAR185011XX-V01-05-page7.txt: [('symbolicallyderusalem', 'symbol i call y der us a lem')]
PTAR185011XX-V01-11-page4.txt: [('encourainfluence', 'en c our a influence')]
PTAR1850XXXX-VXX-XX-page13.txt: [('fruitofthearticles', 'fruit of the articles')]
PTAR1850XXXX-VXX-XX-page21.txt: [('fearlesssummation', 'fear less sum m a t i o n')]
PTAR1850XXXX-VXX-XX-page26.txt: [('ascertainterpositions', 'as certain ter positions')]
PTAR1850XXXX-VXX-XX-page28.txt: [('concepthemselves', 'c once p themselves'), ('teachconceptions', 'teach conceptions')]
PTAR1850XXXX-VXX-XX-page31.txt: [('understandpromises', 'understand promises')]
PTAR1850XXXX-VXX-XX-page34.txt: [('inconceivagainst', 'in c once iv against')]
PTAR1850XXXX-VXX-XX-page37.txt: [('counterfeitprovidence', 'counterfeit providence')]
PTAR1850XXXX-VXX-XX-page43.txt: [('thereuntoperfect', 'thereunto perfect')]
PTAR1850XXXX-VXX-XX-page44.txt: [('trespassoffering', 'trespass offering'), ('forgiveignorance', 'for give ignorance')]
PTAR1850XXXX-VXX-XX-page6.txt: [('conaccomplishing', 'con accomplishing')]

Check Correction 7

In [34]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/PTAR/correction7

Average verified rate: 0.963552162714036

Average of error rates: 0.03613913043478261

Total token count: 224787

In [35]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[35]:
[("'", 543),
 ('th', 154),
 ('ch', 132),
 ('d', 117),
 ('n', 83),
 ('t', 79),
 ('ver', 75),
 ('ex', 73),
 ('e', 72),
 ('x', 58),
 ('m', 55),
 ('w', 53),
 ("the'", 45),
 ('ment', 43),
 ('r', 39),
 ('tion', 34),
 ('ly', 31),
 ('f', 29),
 ('g', 28),
 ("and'", 20),
 ("to'", 18),
 ('eze', 17),
 ('ments', 17),
 ('br', 17),
 ('ry', 17),
 ('vt', 15),
 ('re', 13),
 ('nant', 12),
 ('ful', 12),
 ('tuary', 12),
 ('tions', 12),
 ('cy', 11)]
In [ ]: