PTAR-OCR-Evaluation-and-Correction
In [1]:
%load_ext autoreload
In [2]:
%autoreload 2
In [3]:
from text2topics import reports
from text2topics import utilities
from text2topics import clean
import re
import os
from os import listdir
from os.path import isfile, join
import collections
In [4]:
%matplotlib inline
In [5]:
wordlist_dir = "/Users/jeriwieringa/Dissertation/drafts/data/word-lists"
wordlists = ["2016-12-07-SDA-last-names.txt",
"2016-12-07-SDA-place-names.txt",
"2016-12-08-SDA-Vocabulary.txt",
"2017-01-03-place-names.txt",
"2017-02-14-Base-Word-List-SCOWL&KJV.txt",
"2017-02-14-Roman-Numerals.txt",
"2017-03-01-Additional-Approved-Words.txt"
]
In [6]:
spelling_dictionary = utilities.create_spelling_dictionary(wordlist_dir, wordlists)
In [7]:
title = "PTAR"
In [8]:
base_dir = "/Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/{}/".format(title)
Baseline¶
In [9]:
cycle = 'baseline'
In [10]:
stats = reports.overview_report(join(base_dir, cycle), spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/PTAR/baseline Average verified rate: 0.9258265879793642 Average of error rates: 0.0755695652173913 Total token count: 228923
In [11]:
errors_summary = reports.get_errors_summary( stats )
reports.top_errors( errors_summary, 30 )
Out[11]:
[('-', 612), ("'", 515), ('ñ', 232), (')', 230), ('th', 161), ('ch', 128), ('re-', 124), ('be-', 121), (']', 114), ('d', 113), ('¥', 110), ('com-', 97), ('con-', 97), ('ment', 92), ('tion', 84), ('n', 72), ('ver', 71), ('ly', 65), ('in-', 65), ('ex', 64), ('e', 62), ('x', 60), ('un-', 58), ('t', 57), ('*', 57), ('sab-', 56), ('de-', 56), ('ex-', 55), ('m', 54), ("'the", 54), ('w', 51), ('an-', 45), ("the'", 44), ('pro-', 44), ('ments', 43), ('ad-', 40), ('_', 39), ('the-', 39), ('r', 35), ('ñthe', 35), ('command-', 35), ('dis-', 35), ('pre-', 34), ('mandments', 34), ('-the', 32)]
Check Special Character Use¶
In [12]:
reports.tokens_with_special_characters(errors_summary)[:50]
Out[12]:
[('ñ', 232), (')', 230), (']', 114), ('¥', 110), ('*', 57), ('_', 39), ('ñthe', 35), ('(the', 26), ('(see', 24), ('[the', 21), ('(', 19), ('saysñ', 18), ('(or', 17), ('[', 14), ('(ps', 11), ('ñsee', 11), ('(margin', 10), ('ô', 10), ('ñthat', 10), ('(which', 10), ('[margin', 10), ('ñps', 8), ('/', 8), ('[rev', 8), ('[or', 8), ('(i', 8), ('(for', 7), ('[letter', 7), ('=', 7), ('(though', 7), ('[no', 7), ('(rev', 6), ('[see', 6), ('ñand', 6), ("ñ'", 6), ('ñto', 6), ('holies]', 6), ('cryñ', 5), ('(and', 5), ('saidñ', 5), ('(as', 5), ('ñwe', 5), ('[in', 5), ('(in', 5), ('(heb', 5), ('truthñthe', 4), ('worldña', 4), ('(to', 4), ('willñthat', 4), ('it)', 4)]
Correction 1 -- Normalize Characters¶
In [13]:
# %load shared_elements/normalize_characters.py
prev = "baseline"
cycle = "correction1"
directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
os.makedirs(directories['cycle'])
corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))
for filename in corpus:
content = utilities.readfile(directories['prev'], filename)
# Substitute for all other dashes
content = re.sub(r"—-—–‑", r"-", content)
# Substitute formatted apostrophe
content = re.sub(r"\’\’\‘\'\‛\´", r"'", content)
# Replace all special characters with a space (as these tend to occur at the end of lines)
content = re.sub(r"[^a-zA-Z0-9\s,.!?$:;\-&\'\"]", r" ", content)
with open(join(directories['cycle'], filename), mode="w") as o:
o.write(content)
o.close()
Check Correction 1¶
In [14]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/PTAR/correction1 Average verified rate: 0.9364051306513091 Average of error rates: 0.06533913043478261 Total token count: 228509
In [15]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[15]:
[('-', 621), ("'", 550), ('th', 162), ('ch', 131), ('re-', 124), ('be-', 121), ('d', 116), ('com-', 97), ('con-', 97), ('ment', 92), ('tion', 87), ('n', 77), ('ver', 74), ('ex', 70), ('e', 70), ('ly', 67), ('in-', 65), ('t', 64), ('x', 60), ('un-', 58), ('sab-', 56), ('de-', 56), ('ex-', 55), ("'the", 55), ('m', 54), ('w', 52), ('an-', 45), ("the'", 44), ('pro-', 44), ('ments', 43), ('ad-', 42), ('the-', 39), ('r', 35), ('mandments', 35), ('command-', 35), ('dis-', 35), ('pre-', 34), ('-the', 32), ('per-', 31), ('atone-', 30), ('ry', 29), ('--', 27), ('f', 27), ('tuary', 27), ('je-', 26), ('ble', 25), ('g', 25), ('ple', 25), ('tions', 24), ('mercy-seat', 23)]
Correction 2 -- Correct Line Endings¶
In [16]:
# %load shared_elements/correct_line_endings.py
prev = cycle
cycle = "correction2"
directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
os.makedirs(directories['cycle'])
corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))
for filename in corpus:
content = utilities.readfile(directories['prev'], filename)
content = re.sub(r"(\w+)(\-\s{1,})([a-z]+)", r"\1\3", content)
with open(join(directories['cycle'], filename), mode="w") as o:
o.write(content)
o.close()
Check Correction 2¶
In [17]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/PTAR/correction2 Average verified rate: 0.9506618639049302 Average of error rates: 0.0509304347826087 Total token count: 224593
In [18]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[18]:
[('-', 617), ("'", 550), ('th', 161), ('ch', 131), ('d', 116), ('n', 76), ('ver', 73), ('ex', 70), ('e', 70), ('t', 64), ('x', 58), ("'the", 55), ('m', 54), ('w', 52), ("the'", 44), ('ment', 43), ('r', 35), ('tion', 34), ('ly', 33), ('-the', 32), ('--', 27), ('f', 27), ('g', 25), ('mercy-seat', 24), ("'of", 23), ("and'", 20), ('ments', 20), ('sabbath-day', 19), ('scape-goat', 19), ("'and", 18), ('ry', 17), ("to'", 17), ('br', 17), ('eze', 15), ('vt', 15), ('-of', 14), ("'to", 13), ('re-', 13), ('nant', 13), ('-in', 12), ('tuary', 12), ('the-', 12), ('tions', 12), ('ful', 12), ('sabbath-days', 11), ('com-', 11), ("'was", 11), ('shut-door', 11), ('-and', 11), ('con-', 11)]
Correction 3 -- Remove Extra Dashes¶
In [19]:
# %load shared_elements/remove_extra_dashes.py
prev = cycle
cycle = "correction3"
directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
os.makedirs(directories['cycle'])
corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))
for filename in corpus:
content = utilities.readfile(directories['prev'], filename)
text = re.sub(r"[0-9,!?$:;&]", " ", content)
tokens = utilities.tokenize_text(text)
replacements = []
for token in tokens:
if token[0] is "-":
replacements.append((token, token[1:]))
elif token[-1] is "-":
replacements.append((token, token[:-1]))
else:
pass
if len(replacements) > 0:
print("{}: {}".format(filename, replacements))
for replacement in replacements:
content = clean.replace_pair(replacement, content)
else:
pass
with open(join(directories['cycle'], filename), mode="w") as o:
o.write(content)
o.close()
PTAR184907XX-V01-01-page1.txt: [('-II', 'II'), ('-', '')] PTAR184907XX-V01-01-page3.txt: [('-', ''), ('COM-', 'COM'), ('PER-', 'PER'), ('-', '')] PTAR184907XX-V01-01-page4.txt: [('-danger', 'danger'), ('-of', 'of'), ('COVE-', 'COVE'), ('God."-', 'God."'), ('-', ''), ('-', ''), ('-', ''), ('-two', 'two')] PTAR184907XX-V01-01-page5.txt: [('-', ''), ('COM-', 'COM'), ('COV-', 'COV'), ('COMMAND-', 'COMMAND'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('COVE-', 'COVE'), ('-', '')] PTAR184907XX-V01-01-page6.txt: [('-', ''), ('-', ''), ('BOND-', 'BOND'), ('-', '')] PTAR184907XX-V01-01-page8.txt: [('-', ''), ('-', ''), ('no-', 'no')] PTAR184907XX-V01-01-page9.txt: [('-this', 'this'), ('the-', 'the')] PTAR184908XX-V01-02-page1.txt: [('-', ''), ('-', ''), ('-', '')] PTAR184908XX-V01-02-page2.txt: [('-', '')] PTAR184908XX-V01-02-page3.txt: [('-', ''), ('MIN-', 'MIN')] PTAR184908XX-V01-02-page4.txt: [('-', '')] PTAR184908XX-V01-02-page5.txt: [('maen-', 'maen'), ('-', ''), ('-', '')] PTAR184908XX-V01-02-page6.txt: [('GUILT-', 'GUILT'), ('-', ''), ('RELAX-', 'RELAX'), ('hy-', 'hy')] PTAR184908XX-V01-02-page7.txt: [('LAW-', 'LAW'), ('-', ''), ('-', ''), ('-', ''), ('-', '')] PTAR184908XX-V01-02-page8.txt: [('pre-', 'pre'), ('TRANS-', 'TRANS'), ('of-', 'of')] PTAR184908XX-V01-03-page1.txt: [('no-', 'no'), ('continu--', 'continu-')] PTAR184908XX-V01-03-page2.txt: [('-', ''), ('-the', 'the'), ('LOV-', 'LOV'), ('-', ''), ('-in', 'in')] PTAR184908XX-V01-03-page3.txt: [('-', ''), ('-', ''), ('command-', 'command'), ('-whom', 'whom')] PTAR184908XX-V01-03-page4.txt: [('COM-', 'COM'), ('-', ''), ('-', ''), ('-religion', 'religion')] PTAR184908XX-V01-03-page6.txt: [('-the', 'the'), ('-art', 'art'), ('-the', 'the'), ('-over', 'over'), ('-', '')] PTAR184909XX-V01-04-page1.txt: [('OP-', 'OP'), ('Sab-', 'Sab'), ('observ-', 'observ')] PTAR184909XX-V01-04-page2.txt: [('-in', 'in'), ('-', ''), ('-', '')] PTAR184909XX-V01-04-page3.txt: [('DISPER-', 'DISPER')] PTAR184909XX-V01-04-page4.txt: [('-', '')] PTAR184909XX-V01-04-page5.txt: [('-were', 'were')] PTAR184909XX-V01-04-page6.txt: [('-', ''), ('commem-', 'commem'), ('requir-', 'requir'), ('peo-', 'peo')] PTAR184909XX-V01-04-page7.txt: [('-', ''), ('-', ''), ('-if.', 'if.'), ('Sisters--', 'Sisters-')] PTAR184912XX-V01-05-page1.txt: [('-', ''), ('perform--', 'perform-'), ('-', ''), ('-', ''), ('-written', 'written'), ('gospel."-', 'gospel."'), ('-', ''), ('-of', 'of'), ('IMMOR-', 'IMMOR'), ('-atonement', 'atonement')] PTAR184912XX-V01-05-page2.txt: [('past-', 'past'), ('-exceedingly', 'exceedingly'), ('-.interesting', '.interesting'), ('-', ''), ('is-', 'is'), ('con-', 'con'), ('JUSTIFI-', 'JUSTIFI'), ('---', '--'), ('--', '-'), ('truth-', 'truth'), ('--the', '-the'), ('-SA', 'SA'), ('of-', 'of'), ('-', ''), ('-felt.', 'felt.')] PTAR184912XX-V01-05-page3.txt: [('-vision.', 'vision.'), ('-has', 'has'), ('-', ''), ('-and', 'and'), ('the-', 'the'), ('-days', 'days'), ('-', ''), ('-', ''), ('Con-', 'Con'), ('wilder-', 'wilder'), ('settle-', 'settle'), ('-', ''), ('-Sister', 'Sister'), ('-this', 'this'), ('-or', 'or'), ('-Sitter', 'Sitter'), ('..thought-', '..thought'), ('myfeelings-', 'myfeelings'), ('-was', 'was'), ('-', ''), ('-Nom', 'Nom'), ('-ta', 'ta'), ('-to', 'to')] PTAR184912XX-V01-05-page4.txt: [('-', '')] PTAR184912XX-V01-05-page5.txt: [('-', ''), ('prepara-', 'prepara'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] PTAR184912XX-V01-05-page6.txt: [('them.-', 'them.'), ('-bowels', 'bowels')] PTAR184912XX-V01-05-page7.txt: [('globe.--', 'globe.-'), ('--I', '-I'), ('of-', 'of')] PTAR184912XX-V01-05-page8.txt: [('I.-', 'I.'), ('that-', 'that'), ('-', ''), ('-', ''), ('n-', 'n')] PTAR184912XX-V01-06-page1.txt: [('-', ''), ('-take', 'take')] PTAR184912XX-V01-06-page2.txt: [('-Also', 'Also'), ('-', ''), ('TABER-', 'TABER'), ('-', ''), ('-', '')] PTAR184912XX-V01-06-page3.txt: [('-the', 'the'), ('-in', 'in'), ('-', ''), ('-', ''), ('-', ''), ('the-', 'the'), ('-is', 'is'), ('-', '')] PTAR184912XX-V01-06-page4.txt: [('-those', 'those'), ('-this', 'this'), ('BE-', 'BE'), ('-procure', 'procure'), ('-If', 'If'), ('-lbregning', 'lbregning')] PTAR184912XX-V01-06-page5.txt: [('-', ''), ('-', '')] PTAR184912XX-V01-06-page6.txt: [('-in', 'in'), ('persecu-', 'persecu'), ('-the', 'the'), ('-In', 'In')] PTAR184912XX-V01-06-page7.txt: [('interest-', 'interest'), ('dollar-', 'dollar'), ('-', '')] PTAR184912XX-V01-06-page8.txt: [('malice.-', 'malice.'), ('-', ''), ('-why', 'why'), ('-regard', 'regard')] PTAR185003XX-V01-07-page1.txt: [('-with', 'with'), ('WCT-', 'WCT'), ('REST-', 'REST'), ('REST-', 'REST'), ('-who', 'who'), ('-', '')] PTAR185003XX-V01-07-page2.txt: [('-', ''), ('Sabbath-', 'Sabbath'), ('-', ''), ('COMMAND-', 'COMMAND'), ('COM-', 'COM'), ('con-', 'con')] PTAR185003XX-V01-07-page3.txt: [('-', ''), ('-', ''), ('-', ''), ('-even', 'even'), ('-for', 'for'), ('--our', '-our'), ('-people', 'people'), ('-', ''), ('-', '')] PTAR185003XX-V01-07-page4.txt: [('COMMAND-', 'COMMAND'), ('-', ''), ('-', ''), ('-', ''), ('cove-', 'cove'), ('-', ''), ('EN-', 'EN'), ('-', ''), ('-', ''), ('DE-', 'DE'), ('fall.-', 'fall.'), ('-because', 'because')] PTAR185003XX-V01-07-page5.txt: [('-with', 'with'), ('--Cor.', '-Cor.'), ('-', '')] PTAR185003XX-V01-07-page6.txt: [('-The', 'The'), ('-every', 'every'), ('"-', '"'), ('-"law', '"law'), ('of-', 'of'), ("'Gal.-", "'Gal."), ('-', ''), ('-we', 'we'), ('ThelairOf.-', 'ThelairOf.'), ('-', ''), ('-sin', 'sin'), ('-', ''), ('-then.we', 'then.we'), ("-the'", "the'"), ('-warning', 'warning'), ('-', ''), ('-the', 'the'), ('-plain', 'plain'), ('verses-', 'verses'), ('COMMAND-', 'COMMAND'), ('-', '')] PTAR185003XX-V01-07-page7.txt: [('-earth', 'earth'), ('-', ''), ('--was', '-was'), ('of--', 'of-'), ('Jesusfor--', 'Jesusfor-'), ('-atonement', 'atonement'), ('-can', 'can'), ('-', ''), ('-the', 'the'), ('-Law', 'Law'), ('-"', '"'), ('-Cor.', 'Cor.'), ('-', ''), ('-the', 'the'), ('yptsp-', 'yptsp'), ('-', ''), ('.-', '.'), ('execu-', 'execu'), ('-..co', '..co'), ('-hut', 'hut'), ('-or', 'or'), ('-by', 'by'), ('-the', 'the'), ('-with', 'with'), ('new.-', 'new.'), ('-', ''), ('RIGHTEOUS-', 'RIGHTEOUS'), ('-', ''), ('-exposiqon', 'exposiqon'), ('-', ''), ('-God', 'God'), ('Moses.-', 'Moses.'), ('-', ''), ('condem.-', 'condem.'), ('MINIS-', 'MINIS'), ('--', '-'), ('de-', 'de'), ('the-', 'the'), ('-sin', 'sin'), ('-neither', 'neither'), ('-NOUTALITY.', 'NOUTALITY.'), ('-and', 'and'), ('-the', 'the'), ('"-.-', '"-.'), ('-holy', 'holy'), ('JUSTI-', 'JUSTI'), ('-', ''), ('righteousness"-', 'righteousness"'), ('-MORE.', 'MORE.'), ('which-', 'which'), ('-never.', 'never.'), ('away."-', 'away."')] PTAR185003XX-V01-07-page8.txt: [('-', ''), ('-', ''), ('-believe', 'believe'), ('-', ''), ('-', ''), ('-will', 'will'), ('explana-', 'explana'), ('-who', 'who')] PTAR185003XX-V01-08-page1.txt: [('-week', 'week'), ('-', ''), ('-said', 'said')] PTAR185003XX-V01-08-page2.txt: [('--For', '-For'), ('-expostulates', 'expostulates')] PTAR185003XX-V01-08-page3.txt: [('-consequenge.ire', 'consequenge.ire'), ('-"There', '"There'), ('re-', 're'), ('passion.L-', 'passion.L'), ('end-', 'end'), ('-In', 'In'), ('iron.-', 'iron.'), ('-', ''), ('Point-', 'Point'), ('govern-', 'govern'), ('father-', 'father'), ('-', ''), ('-', ''), ('-', '')] PTAR185003XX-V01-08-page4.txt: [('-', ''), ('-marginal', 'marginal'), ('-Here', 'Here'), ('sanctua-', 'sanctua'), ('-', ''), ('-sins', 'sins'), ('else-', 'else'), ('-', ''), ('in-', 'in')] PTAR185003XX-V01-08-page5.txt: [('HEAV-', 'HEAV')] PTAR185003XX-V01-08-page6.txt: [('-', ''), ("'A.-", "'A."), ('-commandment', 'commandment'), ('-', ''), ('a-', 'a'), ('-six', 'six'), ('infi-', 'infi'), ('-', ''), ('other-', 'other'), ('"Watchnit-', '"Watchnit'), ('-', ''), ('-connected', 'connected'), ('-sanctuary', 'sanctuary'), ('di-', 'di'), ('-of', 'of')] PTAR185003XX-V01-08-page7.txt: [('-of', 'of'), ('of-', 'of'), ('watchfulness.-', 'watchfulness.'), ('PEO-', 'PEO'), ('-Gentile', 'Gentile'), ('-be', 'be'), ('bless-', 'bless'), ('con-', 'con'), ('tho-', 'tho'), ('under-', 'under')] PTAR185003XX-V01-08-page8.txt: [('-immediately', 'immediately')] PTAR185004XX-V01-09-page1.txt: [('-', ''), ('-still', 'still'), ('-', '')] PTAR185004XX-V01-09-page2.txt: [('-not', 'not'), ('-', ''), ('-"If', '"If'), ('-', ''), ('-', ''), ('fallen"--', 'fallen"-')] PTAR185004XX-V01-09-page3.txt: [('KEEP-', 'KEEP'), ('-a', 'a'), ('--', '-'), ('against-', 'against')] PTAR185004XX-V01-09-page5.txt: [('-which', 'which'), ('-', ''), ('-', '')] PTAR185004XX-V01-09-page6.txt: [('-', ''), ('forlIttoith-', 'forlIttoith'), ('-that', 'that'), ('bring-', 'bring')] PTAR185004XX-V01-09-page7.txt: [('-was', 'was'), ('-they', 'they'), ('do-', 'do'), ('-', '')] PTAR185004XX-V01-09-page8.txt: [('-and', 'and'), ('field."--', 'field."-'), ('-My', 'My'), ('-', ''), ('-therefore', 'therefore')] PTAR185005XX-V01-10-page1.txt: [('--', '-'), ('--', '-'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('beauti-', 'beauti')] PTAR185005XX-V01-10-page2.txt: [('-', ''), ('ta-', 'ta'), ('-very', 'very'), ('-', ''), ('-"', '"'), ('New-', 'New'), ('ever-', 'ever'), ('-', ''), ('-', ''), ('reproach-', 'reproach'), ('-but', 'but'), ('-', ''), ('rut-', 'rut')] PTAR185005XX-V01-10-page3.txt: [('be-', 'be'), ('pre-', 'pre'), ('in.--', 'in.-'), ('-', ''), ('-to', 'to'), ('over-', 'over'), ('-Matt.', 'Matt.'), ('-', ''), ('-', ''), ('--', '-'), ('-', ''), ('-', '')] PTAR185005XX-V01-10-page4.txt: [('-people', 'people'), ('HEAV-', 'HEAV'), ('IMA-', 'IMA'), ('-', ''), ('-', '')] PTAR185005XX-V01-10-page5.txt: [('-above', 'above'), ('-power', 'power')] PTAR185005XX-V01-10-page6.txt: [('-clearly', 'clearly'), ('-', ''), ('-', ''), ('-', ''), ('bride-', 'bride')] PTAR185005XX-V01-10-page7.txt: [('-dealt', 'dealt'), ('treacher-', 'treacher'), ('-', ''), ('-living', 'living'), ('Lord-', 'Lord'), ('-', ''), ('-', ''), ('Rev.-', 'Rev.'), ('-', ''), ('-', ''), ('-', ''), ('ad-', 'ad')] PTAR185005XX-V01-10-page8.txt: [('-by', 'by')] PTAR185008XX-V01-01-page1.txt: [('-', '')] PTAR185008XX-V01-01-page10.txt: [('wick-', 'wick'), ('-', '')] PTAR185008XX-V01-01-page11.txt: [('pa-', 'pa'), ('-', ''), ('-and', 'and'), ('an-', 'an'), ('sev-', 'sev'), ('them-', 'them')] PTAR185008XX-V01-01-page12.txt: [('-reproof.', 'reproof.'), ('Ag-', 'Ag'), ('it-', 'it')] PTAR185008XX-V01-01-page13.txt: [('busi-', 'busi'), ('-has', 'has'), ('-would', 'would'), ('T-', 'T'), ('effect.-', 'effect.')] PTAR185008XX-V01-01-page14.txt: [('MES-', 'MES'), ('-', ''), ('-vision', 'vision'), ('-us', 'us'), ('fin-', 'fin'), ('law-', 'law'), ('-it', 'it'), ('in-', 'in'), ('MESSAGE.-', 'MESSAGE.'), ('determin-', 'determin'), ('Sec-', 'Sec')] PTAR185008XX-V01-01-page15.txt: [('Hast-', 'Hast'), ('-', ''), ('prayer.-', 'prayer.'), ('-', ''), ('judg-', 'judg')] PTAR185008XX-V01-01-page16.txt: [('-', ''), ('uni-', 'uni'), ('-wicked', 'wicked'), ('-blessedness', 'blessedness'), ('-and', 'and')] PTAR185008XX-V01-01-page17.txt: [('-us', 'us')] PTAR185008XX-V01-01-page2.txt: [('large-', 'large'), ('move--', 'move-'), ('oc-', 'oc'), ('inways-', 'inways')] PTAR185008XX-V01-01-page3.txt: [('ut-', 'ut'), ('commence-', 'commence'), ('do-', 'do'), ('-', ''), ('-', ''), ('how-', 'how')] PTAR185008XX-V01-01-page4.txt: [('upo-', 'upo'), ('vari-', 'vari'), ('ex-', 'ex')] PTAR185008XX-V01-01-page5.txt: [('-unaided', 'unaided'), ('-a', 'a'), ('pe-', 'pe'), ('com-', 'com')] PTAR185008XX-V01-01-page6.txt: [('-evil', 'evil'), ('-', ''), ('re-', 're'), ('-...ndles', '...ndles'), ('pray-', 'pray')] PTAR185008XX-V01-01-page7.txt: [('pros-', 'pros'), ('-filled', 'filled')] PTAR185008XX-V01-01-page8.txt: [('vir-', 'vir'), ('EX-', 'EX'), ('-', '')] PTAR185008XX-V01-01-page9.txt: [('-', ''), ('-', ''), ('pa-', 'pa'), ('-', ''), ('be-', 'be'), ('Sec-', 'Sec'), ('-Iam.', 'Iam.'), ('--Ihave', '-Ihave'), ('saved.-', 'saved.'), ('-', ''), ('fa-', 'fa'), ('and-', 'and'), ('pre-', 'pre'), ('.-', '.'), ('-and', 'and')] PTAR185008XX-V01-02-page1.txt: [('group-', 'group'), ('-', ''), ('ediica-', 'ediica'), ('grace---', 'grace--'), ('-the', 'the'), ('door-', 'door'), ("-exceed'", "exceed'"), ('-', ''), ('-', '')] PTAR185008XX-V01-02-page10.txt: [('who-', 'who'), ('-', ''), ('Provi-', 'Provi'), ('hav-', 'hav'), ('-candid"', 'candid"'), ('-', ''), ('-', ''), ('-up', 'up'), ('-partingof', 'partingof'), ('-never', 'never'), ('unim-', 'unim')] PTAR185008XX-V01-02-page11.txt: [('-s', 's'), ('-', '')] PTAR185008XX-V01-02-page12.txt: [('pro-', 'pro'), ('-', ''), ('-at', 'at'), ('-as', 'as'), ('Now-', 'Now'), ('pre-', 'pre'), ('provi-', 'provi'), ('--identified', '-identified'), ('--the', '-the'), ('--and', '-and'), ('tar-', 'tar')] PTAR185008XX-V01-02-page13.txt: [('expound-', 'expound'), ('-', ''), ('-', ''), ('with-', 'with'), ('disap-', 'disap')] PTAR185008XX-V01-02-page14.txt: [('ADVENT-', 'ADVENT'), ('explana-', 'explana'), ('dis-', 'dis'), ('-at', 'at'), ('-othertiine.', 'othertiine.'), ('illustration.-', 'illustration.'), ('-', ''), ('-in', 'in'), ('-answer', 'answer'), ('Provi-', 'Provi'), ('Matt.-', 'Matt.'), ('fulfill-', 'fulfill'), ('ie-', 'ie'), ('-', ''), ('-', ''), ('-.', '.'), ('-', ''), ('fallbili-', 'fallbili'), ('re-', 're'), ('-after', 'after'), ('-consequent', 'consequent')] PTAR185008XX-V01-02-page15.txt: [('ar-', 'ar'), ('-', ''), ('-"', '"'), ('--', '-'), ('approv-', 'approv'), ('--', '-'), ('-shall', 'shall'), ('Je-', 'Je'), ('-', ''), ('-', '')] PTAR185008XX-V01-02-page16.txt: [('the--', 'the-'), ('-"the', '"the'), ('-Advent', 'Advent'), ('God.-', 'God.'), ('-now', 'now'), ('need-', 'need'), ('abol-', 'abol'), ('tes-', 'tes'), ('-it', 'it'), ('"ever-', '"ever'), ('Je-', 'Je'), ('Arch-', 'Arch'), ('-perpetuity', 'perpetuity'), ('as-', 'as'), ('-trifling', 'trifling'), ('assem-', 'assem'), ('DAY."-', 'DAY."'), ('-occasioned', 'occasioned'), ('in-', 'in'), ('-haus.', 'haus.'), ('-', ''), ('Lord---', 'Lord--'), ('-', ''), ('-', ''), ('-words', 'words'), ('-of', 'of'), ('-the', 'the'), ('-', '')] PTAR185008XX-V01-02-page2.txt: [('-', ''), ('-', ''), ('imme-', 'imme')] PTAR185008XX-V01-02-page3.txt: [('-Virgins', 'Virgins'), ('specified-', 'specified'), ('-get', 'get'), ('-', ''), ('-that', 'that'), ('-', ''), ('pro-', 'pro'), ('-separate', 'separate'), ('dis-', 'dis'), ('lOtb.-', 'lOtb.'), ('-month', 'month'), ('-midnight', 'midnight'), ('-a', 'a'), ('-finally', 'finally'), ('prOcIa-', 'prOcIa')] PTAR185008XX-V01-02-page4.txt: [('-', ''), ('-', '')] PTAR185008XX-V01-02-page5.txt: [('con-', 'con'), ('.-', '.'), ('be-', 'be'), ('burn-', 'burn'), ('ex-', 'ex'), ('him.--', 'him.-'), ('po-', 'po'), ('-', ''), ('-a', 'a'), ('do-', 'do'), ('Vfaxis-', 'Vfaxis'), ('-', '')] PTAR185008XX-V01-02-page6.txt: [('-and', 'and'), ('-', ''), ('AND-', 'AND'), ('-from', 'from'), ('mes-', 'mes'), ('-after', 'after')] PTAR185008XX-V01-02-page7.txt: [('-.devising', '.devising'), ('-', ''), ('-plicityof', 'plicityof'), ('The-', 'The'), ('-we', 'we')] PTAR185008XX-V01-02-page8.txt: [('Crea-', 'Crea'), ('-', ''), ('in-', 'in'), ('there-', 'there'), ('un-', 'un'), ('recorded-', 'recorded'), ('-In', 'In'), ('rerip-', 'rerip'), ('tend-', 'tend'), ('--the', '-the'), ('prophet-', 'prophet'), ('-which', 'which'), ('"-', '"'), ('cor-', 'cor')] PTAR185008XX-V01-02-page9.txt: [('empires-', 'empires'), ('minu-', 'minu'), ('com-', 'com'), ('ser-', 'ser'), ('num-', 'num'), ('con-', 'con'), ('witness-', 'witness'), ('Je-', 'Je')] PTAR185009XX-V01-03-page1.txt: [('-', ''), ('-', ''), ('suc-', 'suc'), ('print-', 'print'), ('-', ''), ('CERTAINTY-', 'CERTAINTY'), ('-to', 'to'), ('-', ''), ('success-', 'success')] PTAR185009XX-V01-03-page10.txt: [('Is-', 'Is'), ('-other', 'other')] PTAR185009XX-V01-03-page11.txt: [('ADVENT-', 'ADVENT'), ('cove-', 'cove'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('Sanctua-', 'Sanctua'), ('cove-', 'cove'), ('-', '')] PTAR185009XX-V01-03-page12.txt: [('Sam.-', 'Sam.'), ('-Aholiab', 'Aholiab'), ('-wisdom', 'wisdom'), ('chant-', 'chant'), ('-vessele', 'vessele'), ('-', ''), ('for-', 'for'), ('taberna-', 'taberna'), ('-', ''), ('-', ''), ('-in', 'in'), ('-', ''), ('-thereof', 'thereof'), ('-set', 'set'), ('-Lord', 'Lord'), ('"Purifica-', '"Purifica'), ('desir--', 'desir-'), ('-', ''), ('Sanctuaryl-', 'Sanctuaryl')] PTAR185009XX-V01-03-page13.txt: [('-', ''), ('-prove', 'prove'), ('-Aaron', 'Aaron'), ('-', ''), ('-', ''), ('-', ''), ('-land', 'land'), ('-', ''), ('bet-', 'bet'), ('con-', 'con'), ('-he', 'he'), ('offollowed-', 'offollowed'), ('-ef', 'ef'), ('.-', '.'), ('flesh-', 'flesh'), ('-', ''), ('-also', 'also')] PTAR185009XX-V01-03-page14.txt: [('-heaven', 'heaven'), ('of-', 'of'), ('of-', 'of'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('individu-', 'individu'), ('-New', 'New'), ('Le-', 'Le')] PTAR185009XX-V01-03-page15.txt: [('-', ''), ('enter-', 'enter'), ('-', ''), ('---Last', '--Last')] PTAR185009XX-V01-03-page16.txt: [('-', ''), ('-', ''), ('-', ''), ('-they', 'they'), ('-', ''), ('wa-', 'wa'), ('-', ''), ('seventy-', 'seventy'), ('meet-', 'meet'), ('re-', 're')] PTAR185009XX-V01-03-page2.txt: [('-heaven', 'heaven'), ('proph-', 'proph'), ('inter-', 'inter'), ('in-', 'in')] PTAR185009XX-V01-03-page3.txt: [('char-', 'char'), ('de-', 'de'), ('wil-', 'wil'), ('com-', 'com'), ('-God', 'God'), ('-To', 'To')] PTAR185009XX-V01-03-page4.txt: [('-"', '"'), ('-', ''), ('-', ''), ('-', ''), ('employ-', 'employ'), ('-is', 'is'), ('-to', 'to'), ('remain-', 'remain')] PTAR185009XX-V01-03-page5.txt: [('-as', 'as'), ('the-', 'the'), ('IVIan-', 'IVIan'), ('Bride-', 'Bride'), ('corn-', 'corn'), ('an-', 'an')] PTAR185009XX-V01-03-page6.txt: [('Ad-', 'Ad'), ('de-', 'de'), ('un-', 'un'), ('Command-', 'Command'), ('com-', 'com'), ('-', ''), ('"-', '"'), ('the-', 'the'), ('fool-', 'fool'), ('NOT-', 'NOT'), ('ser-', 'ser'), ('-', '')] PTAR185009XX-V01-03-page7.txt: [('"In-', '"In'), ('-God', 'God'), ('-away', 'away'), ('-', ''), ('pave-', 'pave'), ('-as', 'as'), ('-a', 'a'), ('Ad-', 'Ad'), ('-', '')] PTAR185009XX-V01-03-page8.txt: [('Jeho-', 'Jeho'), ('first.-', 'first.'), ('-', ''), ('-', ''), ('coun-', 'coun'), ('-', ''), ('since.-', 'since.'), ('-not', 'not'), ('-', ''), ('great-', 'great'), ('-', ''), ('pub-', 'pub'), ('-', ''), ('-all', 'all'), ('-had', 'had'), ('-Those', 'Those'), ('-', '')] PTAR185009XX-V01-03-page9.txt: [('-', ''), ('than-', 'than'), ('-', ''), ('-observes', 'observes'), ('-maintain', 'maintain'), ('-compel', 'compel'), ('----', '---'), ('-', ''), ('-', ''), ('-It', 'It'), ('clannessed-', 'clannessed'), ('-the', 'the'), ('us-', 'us'), ('--still', '-still'), ('treatit.-', 'treatit.'), ("hereafter.'-", "hereafter.'"), ('-known', 'known'), ('-', ''), ('-according', 'according'), ('unwil-', 'unwil'), ('-', ''), ('-your', 'your'), ('-', ''), ('Veri-', 'Veri'), ('-verily', 'verily'), ('-what', 'what'), ('to-', 'to'), ('now-', 'now'), ('-been', 'been'), ('-belong', 'belong'), ('-ordinances', 'ordinances'), ('-does', 'does'), ('-his', 'his'), ('--on', '-on'), ('imervation.--L-', 'imervation.--L'), ('-who', 'who')] PTAR185009XX-V01-04-page1.txt: [('DAY-', 'DAY'), ('-', ''), ('-But', 'But'), ('THOU-', 'THOU'), ('world-', 'world'), ('-', ''), ('-', ''), ('JUDG-', 'JUDG')] PTAR185009XX-V01-04-page10.txt: [('-', ''), ('-', ''), ('atone-', 'atone'), ('-', ''), ('-', '')] PTAR185009XX-V01-04-page11.txt: [('-', ''), ('-', ''), ('reali-', 'reali'), ('ini--', 'ini-'), ('-', ''), ('-definite', 'definite'), ('atone-', 'atone'), ('taber-', 'taber'), ('-', ''), ('-', '')] PTAR185009XX-V01-04-page12.txt: [('-', ''), ('trans-', 'trans'), ('for-', 'for'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-"baptized', '"baptized')] PTAR185009XX-V01-04-page13.txt: [('do-', 'do'), ('objec-', 'objec'), ('in-', 'in'), ('hea-', 'hea'), ('abomi-', 'abomi'), ('-', '')] PTAR185009XX-V01-04-page14.txt: [('Je-', 'Je'), ('un-', 'un'), ('wasful-', 'wasful'), ('-examined.', 'examined.'), ('sa-', 'sa'), ('-', ''), ('-', ''), ('He-', 'He')] PTAR185009XX-V01-04-page15.txt: [('-without', 'without'), ('un-', 'un'), ('Won-', 'Won'), ('cleans-', 'cleans')] PTAR185009XX-V01-04-page16.txt: [('-truth.', 'truth.'), ('Eno-', 'Eno'), ('-', ''), ('-', ''), ('-', ''), ('--', '-'), ('-', '')] PTAR185009XX-V01-04-page2.txt: [('-', ''), ('na-', 'na'), ('-from', 'from')] PTAR185009XX-V01-04-page3.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-but', 'but'), ('--He', '-He'), ('A-', 'A'), ('-Bro.', 'Bro.'), ("---'", "--'")] PTAR185009XX-V01-04-page4.txt: [('--', '-'), ('before.--', 'before.-'), ('vol-', 'vol')] PTAR185009XX-V01-04-page5.txt: [('ac-', 'ac'), ('-', ''), ('as-', 'as'), ('fulfilled.--', 'fulfilled.-'), ('under-', 'under'), ('faith-', 'faith'), ('Universal-', 'Universal')] PTAR185009XX-V01-04-page6.txt: [('infor-', 'infor'), ('di-', 'di'), ('-', ''), ('yes.---', 'yes.--'), ('-ahead.', 'ahead.'), ('-the', 'the')] PTAR185009XX-V01-04-page7.txt: [('-', ''), ('satisfac-', 'satisfac'), ('scarce-', 'scarce'), ('-unbound', 'unbound'), ('faith--', 'faith-'), ('-', ''), ('-.', '.'), ('-', '')] PTAR185009XX-V01-04-page8.txt: [('-', ''), ('-For', 'For'), ('THOU-', 'THOU'), ('-', ''), ('-apply', 'apply'), ('-was', 'was'), ('-', ''), ('ser-', 'ser'), ('Rev.-', 'Rev.'), ('-', ''), ('-', ''), ('sup-', 'sup')] PTAR185009XX-V01-04-page9.txt: [('-', ''), ('saying-', 'saying'), ('-', ''), ('up-', 'up'), ('-', ''), ('-', ''), ('-', ''), ('be-', 'be'), ('-at', 'at'), ('-', ''), ('P-', 'P'), ('-', ''), ('forgiVe-', 'forgiVe'), ('-', ''), ('-Lord', 'Lord'), ('-court', 'court'), ('I-', 'I')] PTAR185009XX-V01-EX-page10.txt: [('the-', 'the'), ('moun-', 'moun'), ('super-', 'super'), ('two-horn-', 'two-horn'), ('-is', 'is')] PTAR185009XX-V01-EX-page11.txt: [('-', ''), ('Mon-', 'Mon'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('promi-', 'promi'), ('-', ''), ('-Itt', 'Itt'), ('-a', 'a'), ('-', ''), ('COMMAND-', 'COMMAND'), ('-', ''), ('con-', 'con')] PTAR185009XX-V01-EX-page12.txt: [('of-', 'of'), ('-day', 'day'), ('-', ''), ('-', ''), ('-', ''), ('enjoin-', 'enjoin'), ('-', '')] PTAR185009XX-V01-EX-page13.txt: [('-', ''), ('-', ''), ('-', ''), ('WON-', 'WON'), ('-', ''), ('re-', 're'), ('--the', '-the'), ('with-', 'with'), ('salva-', 'salva')] PTAR185009XX-V01-EX-page14.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('ta-', 'ta'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('him-', 'him'), ('corn-', 'corn')] PTAR185009XX-V01-EX-page15.txt: [('Sanc-', 'Sanc'), ('-borne', 'borne'), ('FIL-', 'FIL'), ('per-', 'per'), ('giV-', 'giV'), ('-and', 'and'), ('ene-', 'ene'), ('-', '')] PTAR185009XX-V01-EX-page16.txt: [('-', ''), ('-but', 'but'), ('heav-', 'heav'), ('-', ''), ('pre-', 'pre'), ('-', ''), ('-', ''), ('de-', 'de'), ('na-', 'na'), ('trans-', 'trans')] PTAR185009XX-V01-EX-page17.txt: [('-', ''), ('af-', 'af'), ('un-', 'un'), ('desti-', 'desti'), ("-ever.'", "ever.'"), ('FIL-', 'FIL'), ('--', '-'), ('-', ''), ('--', '-'), ('--', '-')] PTAR185009XX-V01-EX-page2.txt: [('-', ''), ('stood.-', 'stood.'), ('-.As', '.As'), ('unnecessa-', 'unnecessa'), ('RE-', 'RE'), ('truth.--', 'truth.-'), ('-art', 'art'), ('hot."-', 'hot."'), ('-Like', 'Like'), ('-day', 'day'), ('-', ''), ('an-', 'an'), ('-', ''), ('door-', 'door'), ('-they', 'they')] PTAR185009XX-V01-EX-page3.txt: [('-', ''), ('af-', 'af'), ('com-', 'com'), ('-mandments', 'mandments'), ('-', '')] PTAR185009XX-V01-EX-page4.txt: [('-', ''), ('the-cengre-', 'the-cengre'), ('"Be-', '"Be'), ('-', ''), ('-', ''), ('-shut', 'shut'), ('-the', 'the'), ('-white', 'white'), ('ex-', 'ex'), ('-', ''), ('death.-', 'death.'), ('-there', 'there'), ('-', ''), ('-to', 'to'), ('-Eze.', 'Eze.'), ('-', ''), ('-', ''), ('-life', 'life'), ('hea-', 'hea'), ('-rebuke', 'rebuke'), ('-', ''), ('.--', '.-'), ('-', ''), ('al-', 'al'), ('-who', 'who'), ('-yet', 'yet'), ('-nearer-a', 'nearer-a'), ('class-', 'class')] PTAR185009XX-V01-EX-page5.txt: [('Jeru-', 'Jeru'), ('in-', 'in'), ('es-', 'es'), ('-', ''), ('after-', 'after'), ('-not', 'not'), ('-', ''), ('anti-', 'anti'), ("-Ione.'", "Ione.'")] PTAR185009XX-V01-EX-page6.txt: [('-in', 'in'), ('-we', 'we'), ('Jo-', 'Jo'), ('-', ''), ('away-', 'away'), ('ad-', 'ad'), ('-', '')] PTAR185009XX-V01-EX-page7.txt: [('Corn-', 'Corn'), ('-', ''), ('ser-', 'ser'), ('scat-', 'scat'), ('-', ''), ('pow-', 'pow'), ('pro-', 'pro'), ('-Herald', 'Herald'), ('-', ''), ('-', ''), ('-of', 'of'), ('-do', 'do'), ('-be', 'be'), ('ii-', 'ii'), ('-', ''), ('-', ''), ('-', ''), ('field.-', 'field.')] PTAR185009XX-V01-EX-page8.txt: [('-steppinginto', 'steppinginto'), ('-', ''), ('ad-', 'ad'), ('the-', 'the')] PTAR185009XX-V01-EX-page9.txt: [('-says', 'says'), ('-See', 'See'), ('-', ''), ('-', ''), ('-', '')] PTAR185011XX-V01-05-page1.txt: [('-.-', '.-'), ('-Publishing', 'Publishing'), ('\'"ittee-', '\'"ittee'), ('-', ''), ('-wellthatthey', 'wellthatthey'), ('BABY-', 'BABY'), ('-', ''), ('founda-', 'founda'), ('-', ''), ('--Hence', '-Hence'), ('-out', 'out'), ('-the', 'the'), ('-to', 'to'), ("'try-", "'try"), ('organized-', 'organized'), ('-declaring', 'declaring'), ('-', ''), ('and-', 'and'), ('-', ''), ('intro-', 'intro'), ('-ferithe', 'ferithe'), ('-', ''), ('-', ''), ('in-', 'in'), ('-', ''), ('-arose', 'arose'), ('-had', 'had'), ('-What', 'What'), ('-hat', 'hat'), ('-in', 'in'), ('-', ''), ('-lac-', 'lac-'), ('-the', 'the'), ('-', '')] PTAR185011XX-V01-05-page2.txt: [('proces-', 'proces'), ('-of', 'of'), ('proclai-', 'proclai'), ('-my', 'my'), ('-Now', 'Now'), ('per-', 'per')] PTAR185011XX-V01-05-page3.txt: [('-with', 'with'), ('-', ''), ('quoting-', 'quoting'), ('-represents', 'represents'), ('be-', 'be'), ('-agreeing.', 'agreeing.'), ('harmo-', 'harmo'), ('there--', 'there-'), ('-of', 'of'), ('ceas-', 'ceas'), ('-', '')] PTAR185011XX-V01-05-page4.txt: [('-them', 'them'), ('-', ''), ('mes-', 'mes'), ('--just', '-just'), ('-seventy-five', 'seventy-five'), ('-God', 'God'), ('-unbelievers.', 'unbelievers.'), ('-', ''), ('-of', 'of'), ('clo-', 'clo'), ('-July', 'July'), ('-', ''), ('Advent-Li-', 'Advent-Li'), ('No..-', 'No..'), ('mes--', 'mes-'), ('man-', 'man'), ('of-', 'of'), ("-mother's", "mother's"), ('-great', 'great'), ('unparal-', 'unparal'), ('-seventlimonth.', 'seventlimonth.'), ('some-', 'some'), ('-something', 'something')] PTAR185011XX-V01-05-page5.txt: [('-are', 'are'), ('-experience', 'experience'), ('fin-', 'fin'), ('-Gone', 'Gone'), ('-', ''), ('-', ''), ('-to', 'to'), ('be-', 'be'), ('emir-', 'emir'), ('-we', 'we'), ('-is', 'is')] PTAR185011XX-V01-05-page6.txt: [('-Upon', 'Upon'), ('-with', 'with'), ('-the', 'the'), ('how-', 'how'), ('-she', 'she'), ('tall-', 'tall'), ('-the', 'the'), ('-dren', 'dren'), ('res-', 'res'), ('-', ''), ('Jeru-', 'Jeru'), ('carbun-', 'carbun'), ('Jeru-', 'Jeru'), ('-her', 'her'), ('Je-', 'Je'), ('ino-', 'ino')] PTAR185011XX-V01-05-page7.txt: [('-Does', 'Does'), ('Je-', 'Je'), ('--Hold', '-Hold'), ('Lord.God-', 'Lord.God'), ('-for', 'for'), ('-', ''), ('-', ''), ('-', ''), ('--"It', '-"It'), ('-not', 'not'), ('idea.-', 'idea.'), ('-', ''), ('kiah.-', 'kiah.'), ('-', ''), ('-', ''), ('-"Know', '"Know'), ('-', '')] PTAR185011XX-V01-05-page8.txt: [('solo-', 'solo'), ('command--', 'command-'), ('-beloVed', 'beloVed'), ('rims-', 'rims'), ('-Share', 'Share'), ('-the', 'the'), ('-answer', 'answer'), ('-church', 'church'), ('done.-', 'done.'), ('-by', 'by'), ('faith-', 'faith'), ('prejudi-', 'prejudi'), ('life-', 'life'), ('-tat', 'tat'), ('walk-', 'walk'), ('ordinances-', 'ordinances'), ('-which', 'which'), ('par-', 'par'), ('-The', 'The'), ('-', ''), ('-prospect', 'prospect'), ('-work', 'work'), ('The.-', 'The.'), ('-partially', 'partially'), ('-', ''), ('-', ''), ('im-', 'im'), ('-arnongthe', 'arnongthe'), ('to-', 'to'), ('-', ''), ('Louata-', 'Louata'), ('-', ''), ('-', ''), ('-The', 'The'), ('-', ''), ('-be', 'be')] PTAR185011XX-V01-11-page1.txt: [('-TRUTH', 'TRUTH'), ('-', ''), ("Thro'-", "Thro'"), ('-..abbotti', '..abbotti'), ('-', '')] PTAR185011XX-V01-11-page2.txt: [('dan-', 'dan'), ('called-', 'called'), ('-', ''), ('in-', 'in'), ('-the', 'the'), ('coetro-', 'coetro'), ('Pa--', 'Pa-'), ('commandwe-', 'commandwe'), ('harm-', 'harm'), ('observe-', 'observe')] PTAR185011XX-V01-11-page3.txt: [('OB-', 'OB'), ('-No', 'No'), ('commandment.-', 'commandment.'), ('spe-', 'spe'), ('-such', 'such'), ('exposi-', 'exposi'), ('-seventh.', 'seventh.'), ('-', ''), ('-indefinitely."', 'indefinitely."'), ('fol-', 'fol'), ('-proves.', 'proves.'), ('guilt-', 'guilt'), ("'how-", "'how")] PTAR185011XX-V01-11-page4.txt: [('-day', 'day'), ('Scrip-', 'Scrip'), ('restora-', 'restora'), ('there-', 'there'), ('an-', 'an'), ('sus-', 'sus'), ('-', ''), ('ex-', 'ex'), ('re-', 're')] PTAR185011XX-V01-11-page5.txt: [('-T.', 'T.'), ('-started', 'started'), ('-unto', 'unto'), ('-all', 'all'), ('York-', 'York'), ('HOLT-', 'HOLT')] PTAR185011XX-V01-11-page6.txt: [('-of', 'of'), ('-of', 'of'), ('-he', 'he'), ('-to', 'to'), ('separa-', 'separa'), ('ar-', 'ar'), ('-saints', 'saints'), ('-', '')] PTAR185011XX-V01-11-page7.txt: [('-that', 'that'), ('c.-', 'c.'), ('the-', 'the'), ('-and', 'and'), ('-', ''), ('Testa-', 'Testa'), ('-', ''), ('-', ''), ('Gentiles.-', 'Gentiles.'), ('chap--', 'chap-'), ('-See', 'See'), ('-', ''), ('-', ''), ('-', ''), ('direct-', 'direct'), ('-of', 'of'), ('blind-', 'blind')] PTAR185011XX-V01-11-page8.txt: [('em-', 'em'), ('-', ''), ('an-', 'an'), ("'-", "'"), ('-', ''), ('-.', '.'), ('rug-', 'rug'), ('Sab-', 'Sab'), ('-and', 'and')] PTAR1850XXXX-VXX-XX-page1.txt: [('-page', 'page')] PTAR1850XXXX-VXX-XX-page10.txt: [('-', ''), ('-', ''), ('-', ''), ('Sec-', 'Sec'), ('saved.-', 'saved.'), ('-', ''), ('pre-', 'pre'), ('.-', '.')] PTAR1850XXXX-VXX-XX-page11.txt: [('wick-', 'wick'), ("--'", "-'"), ('-understand', 'understand'), ("--'", "-'")] PTAR1850XXXX-VXX-XX-page12.txt: [('pa-', 'pa'), ('-', '')] PTAR1850XXXX-VXX-XX-page13.txt: [('Millerism.--', 'Millerism.-'), ('-will', 'will'), ('imperiously-', 'imperiously'), ('-', ''), ('long-', 'long'), ('AP-', 'AP'), ('it-', 'it')] PTAR1850XXXX-VXX-XX-page14.txt: [('busi-', 'busi'), ('recent-', 'recent'), ('world-', 'world')] PTAR1850XXXX-VXX-XX-page15.txt: [('-what', 'what'), ('sal-', 'sal'), ('-know', 'know'), ('-', ''), ('-shall', 'shall'), ('admon.-', 'admon.')] PTAR1850XXXX-VXX-XX-page16.txt: [('-', ''), ('territo-', 'territo')] PTAR1850XXXX-VXX-XX-page17.txt: [('-days.', 'days.'), ('-', ''), ('-', '')] PTAR1850XXXX-VXX-XX-page18.txt: [('ever-', 'ever'), ('Bride-', 'Bride'), ('acknowledge-', 'acknowledge'), ('-Babylon', 'Babylon'), ('-', ''), ('po-', 'po'), ('torch-', 'torch'), ('-', '')] PTAR1850XXXX-VXX-XX-page19.txt: [('do-', 'do'), ('Mans-', 'Mans'), ('-', ''), ('-', ''), ('glo-', 'glo')] PTAR1850XXXX-VXX-XX-page20.txt: [('fa-', 'fa'), ('lay-', 'lay'), ('-', ''), ('eannotr--', 'eannotr-'), ('-the', 'the')] PTAR1850XXXX-VXX-XX-page21.txt: [('Burn-', 'Burn'), ('-present', 'present'), ('-truth', 'truth'), ('proclama-', 'proclama'), ('can-', 'can'), ('-', ''), ('oversight--', 'oversight-'), ('to-', 'to'), ('dread-', 'dread'), ('-', '')] PTAR1850XXXX-VXX-XX-page22.txt: [('-preservation', 'preservation'), ('-to', 'to'), ('spe-', 'spe'), ('Del-', 'Del')] PTAR1850XXXX-VXX-XX-page23.txt: [('Bonn-', 'Bonn'), ('not."--', 'not."-'), ('witness-', 'witness'), ('Je-', 'Je'), ('counter-', 'counter'), ('conflagra-', 'conflagra'), ('be-', 'be'), ('-', ''), ('-in', 'in'), ('occur-', 'occur'), ('con-', 'con')] PTAR1850XXXX-VXX-XX-page24.txt: [('wit-', 'wit'), ('wit-', 'wit'), ('-it.', 'it.'), ('-', ''), ('-t', 't')] PTAR1850XXXX-VXX-XX-page25.txt: [('-', ''), ('--fill', '-fill'), ('christen-', 'christen'), ('predict-', 'predict')] PTAR1850XXXX-VXX-XX-page26.txt: [('Ad-', 'Ad'), ('pre-', 'pre'), ('provi-', 'provi'), ('tar-', 'tar'), ('-', ''), ('-the', 'the'), ('trans-', 'trans')] PTAR1850XXXX-VXX-XX-page27.txt: [('natural-', 'natural'), ('-ness', 'ness'), ('-him', 'him'), ('--of', '-of'), ('-', ''), ('believe.--', 'believe.-'), ('peo-', 'peo')] PTAR1850XXXX-VXX-XX-page28.txt: [('-through', 'through'), ('tes-', 'tes'), ('-', ''), ('-', '')] PTAR1850XXXX-VXX-XX-page29.txt: [('com-', 'com'), ('unwor-', 'unwor'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] PTAR1850XXXX-VXX-XX-page3.txt: [('giv-', 'giv'), ('ad-', 'ad'), ('move-', 'move'), ('al-', 'al')] PTAR1850XXXX-VXX-XX-page30.txt: [('-', ''), ('-', ''), ('-', ''), ('Sab-', 'Sab'), ('-all', 'all'), ('Je-', 'Je')] PTAR1850XXXX-VXX-XX-page31.txt: [('-', ''), ('the-', 'the'), ('-or', 'or'), ('ener-', 'ener'), ('-', ''), ('-on', 'on'), ('-Saviour', 'Saviour')] PTAR1850XXXX-VXX-XX-page32.txt: [('-', ''), ('de-', 'de'), ('wil-', 'wil')] PTAR1850XXXX-VXX-XX-page33.txt: [('-', ''), ('-world', 'world')] PTAR1850XXXX-VXX-XX-page34.txt: [('histo-', 'histo'), ('-', ''), ('-a', 'a'), ('-attached', 'attached'), ('re-', 're'), ('ex-', 'ex'), ('-', '')] PTAR1850XXXX-VXX-XX-page35.txt: [('anthe-', 'anthe'), ('representa-', 'representa'), ('ac-', 'ac'), ('C.-', 'C.'), ('af-', 'af'), ('-', ''), ('-failed.', 'failed.'), ('Ad-', 'Ad'), ('de-', 'de'), ('-vice', 'vice'), ('-it', 'it'), ('un-', 'un'), ('Command-', 'Command')] PTAR1850XXXX-VXX-XX-page36.txt: [('experience-', 'experience'), ('--better', '-better'), ('-', ''), ('-of', 'of')] PTAR1850XXXX-VXX-XX-page37.txt: [('-', ''), ('an-', 'an'), ('-lie', 'lie'), ('re-', 're'), ('can-', 'can'), ('-', ''), ('an-', 'an'), ('-', ''), ('-', ''), ('Isa.-', 'Isa.'), ('iii.-', 'iii.'), ('-', ''), ('-', ''), ('-', ''), ('coun-', 'coun'), ('repu-', 'repu'), ('re-', 're')] PTAR1850XXXX-VXX-XX-page38.txt: [('-', ''), ('-', ''), ('wit-', 'wit'), ('-', '')] PTAR1850XXXX-VXX-XX-page39.txt: [('-his', 'his'), ('unwil-', 'unwil'), ('-', ''), ('tradi-', 'tradi'), ('-', ''), ('-', ''), ('-was', 'was'), ('-to', 'to'), ('-', ''), ("-MILNER'S", "MILNER'S"), ('-', '')] PTAR1850XXXX-VXX-XX-page4.txt: [('-', ''), ('--in', '-in'), ('how-', 'how')] PTAR1850XXXX-VXX-XX-page40.txt: [('world-', 'world'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('cleans-', 'cleans'), ('-', ''), ('-', ''), ('Solo-', 'Solo')] PTAR1850XXXX-VXX-XX-page41.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('.-', '.'), ('.-', '.'), ('-', ''), ('-', ''), ('-', ''), ('"-', '"'), ('-', '')] PTAR1850XXXX-VXX-XX-page42.txt: [('-', ''), ('-', ''), ('.-', '.'), ('bet-', 'bet'), ('con-', 'con'), ('-', ''), ('.-', '.')] PTAR1850XXXX-VXX-XX-page43.txt: [('of--', 'of-'), ('sanctified-', 'sanctified'), ('-I', 'I'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('individu-', 'individu'), ('Le-', 'Le'), ('atone-', 'atone'), ('-xiv', 'xiv')] PTAR1850XXXX-VXX-XX-page44.txt: [('-', ''), ('accord.-', 'accord.'), ('-shall', 'shall'), ('hal-', 'hal'), ('-', ''), ('-', ''), ('-', ''), ('tres-', 'tres'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] PTAR1850XXXX-VXX-XX-page45.txt: [('-', ''), ('-', ''), ('-"', '"'), ('-the', 'the'), ('un-', 'un'), ('atone-', 'atone'), ('-', ''), ('Thurnmimlight-', 'Thurnmimlight'), ('atone-', 'atone'), ('-', ''), ('per-', 'per'), ('-the.', 'the.')] PTAR1850XXXX-VXX-XX-page46.txt: [('trans-', 'trans'), ('-', ''), ('rebel-', 'rebel'), ('-', ''), ('-', ''), ('-S', 'S'), ('atone-', 'atone'), ('iniqui-', 'iniqui'), ('-', ''), ('atone-', 'atone'), ('"-', '"'), ('taber-', 'taber'), ('per-', 'per')] PTAR1850XXXX-VXX-XX-page47.txt: [('-', ''), ('trans-', 'trans'), ('-Priest', 'Priest'), ('for-', 'for'), ('IIo-', 'IIo'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('cleans-', 'cleans'), ('Dispen-', 'Dispen')] PTAR1850XXXX-VXX-XX-page48.txt: [('-', ''), ('-and', 'and'), ('the-', 'the'), ('-Just', 'Just'), ('-it', 'it'), ('-is', 'is'), ('-the', 'the')] PTAR1850XXXX-VXX-XX-page49.txt: [('-', ''), ('Je-', 'Je'), ('un-', 'un'), ('-', ''), ('ini-', 'ini'), ('ful-', 'ful'), ('-', ''), ('unre-', 'unre'), ('sa-', 'sa'), ('-as', 'as'), ('himself.L---', 'himself.L--'), ('-', ''), ('He-', 'He')] PTAR1850XXXX-VXX-XX-page5.txt: [('vari-', 'vari'), ('-', ''), ('oppo-', 'oppo'), ('-millennium', 'millennium')] PTAR1850XXXX-VXX-XX-page50.txt: [('un-', 'un'), ('Won-', 'Won'), ('Sanc-', 'Sanc'), ('up-', 'up'), ('habited."-', 'habited."'), ('"Be-', '"Be'), ('-mesmer', 'mesmer'), ('-the', 'the'), ('-for', 'for'), ('scape-', 'scape'), ('-from', 'from')] PTAR1850XXXX-VXX-XX-page51.txt: [('-page', 'page'), ('RE-', 'RE')] PTAR1850XXXX-VXX-XX-page6.txt: [('pe-', 'pe'), ('com-', 'com')] PTAR1850XXXX-VXX-XX-page7.txt: [('cer-', 'cer'), ('-', ''), ('-do', 'do'), ('Gam-', 'Gam'), ('re-', 're'), ("'Kock-", "'Kock"), ('pray-', 'pray'), ('-of', 'of'), ('-fallen.', 'fallen.'), ('thy-', 'thy'), ('-', '')] PTAR1850XXXX-VXX-XX-page8.txt: [('pros--', 'pros-'), ('reach-', 'reach'), ('-', ''), ('-continual', 'continual'), ('Bal-', 'Bal')] PTAR1850XXXX-VXX-XX-page9.txt: [('vir-', 'vir'), ('-very', 'very'), ('-', ''), ('-JUSTIFY', 'JUSTIFY'), ('con-', 'con'), ('jus-', 'jus'), ('-from', 'from'), ("-Lot's", "Lot's")]
Check Correction 3¶
In [20]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/PTAR/correction3 Average verified rate: 0.9599526551389643 Average of error rates: 0.039791304347826094 Total token count: 224734
In [21]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[21]:
[("'", 565), ('th', 161), ('ch', 132), ('d', 117), ('n', 77), ('ex', 75), ('ver', 74), ('e', 71), ('t', 68), ('x', 58), ("'the", 56), ('m', 54), ('w', 53), ('ment', 46), ("the'", 45), ('r', 37), ('tion', 34), ('ly', 33), ('f', 27), ('g', 27), ('re', 25), ("'of", 23), ("and'", 20), ('ments', 20), ("'and", 18), ("to'", 18), ('eze', 17), ('ry', 17), ('br', 17), ('vt', 15), ('nant', 13), ("'to", 13), ('ful', 13), ('tuary', 12), ('tions', 12), ('es', 12), ('cy', 11), ('un', 11), ("'was", 11)]
Correction 4 -- Remove Extra Quotation Marks¶
In [22]:
# %load shared_elements/replace_extra_quotation_marks.py
prev = cycle
cycle = "correction4"
directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
os.makedirs(directories['cycle'])
corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))
for filename in corpus:
content = utilities.readfile(directories['prev'], filename)
text = re.sub(r"[0-9,!?$:;&]", " ", content)
tokens = utilities.tokenize_text(text)
corrections = []
for token in tokens:
token_list = list(token)
last_char = token_list[-1]
if last_char is "'":
if len(token) > 1:
if token_list[-2] is 's' or 'S':
pass
else:
corrections.append((token, re.sub(r"'", r"", token)))
else:
pass
elif token[0] is "'":
corrections.append((token, re.sub(r"'", r"", token)))
else:
pass
if len(corrections) > 0:
print('{}: {}'.format(filename, corrections))
for correction in corrections:
content = clean.replace_pair(correction, content)
else:
pass
with open(join(directories['cycle'], filename), mode="w") as o:
o.write(content)
o.close()
PTAR184907XX-V01-01-page3.txt: [("'these", 'these')] PTAR184907XX-V01-01-page5.txt: [("'from", 'from')] PTAR184907XX-V01-01-page7.txt: [("'violate", 'violate')] PTAR184907XX-V01-01-page8.txt: [("'by", 'by')] PTAR184907XX-V01-01-page9.txt: [("'of", 'of'), ("'keep", 'keep')] PTAR184908XX-V01-02-page1.txt: [("'the", 'the')] PTAR184908XX-V01-02-page8.txt: [("'act", 'act')] PTAR184908XX-V01-03-page3.txt: [("'TILL", 'TILL')] PTAR184908XX-V01-03-page7.txt: [("'gave", 'gave')] PTAR184909XX-V01-04-page1.txt: [("'away", 'away')] PTAR184909XX-V01-04-page4.txt: [("'that", 'that'), ("'present", 'present')] PTAR184909XX-V01-04-page5.txt: [("'on", 'on')] PTAR184912XX-V01-05-page2.txt: [("'and", 'and'), ("'recently", 'recently'), ("'Nov.", 'Nov.'), ("'impressed", 'impressed'), ("'Moser", 'Moser'), ("'of", 'of'), ("'the", 'the'), ("'into", 'into'), ("'him", 'him'), ("'the", 'the')] PTAR184912XX-V01-05-page3.txt: [("'had", 'had'), ("'Me", 'Me'), ("'Rhodes", 'Rhodes'), ("'Itelpli", 'Itelpli'), ("'all.", 'all.'), ("'among", 'among'), ("'I", 'I'), ("'the", 'the'), ("'feared", 'feared'), ("'Bro.", 'Bro.'), ("'be.sPre", 'be.sPre'), ("'hy", 'hy')] PTAR184912XX-V01-05-page4.txt: [("'should", 'should')] PTAR184912XX-V01-05-page6.txt: [("'of", 'of')] PTAR184912XX-V01-05-page8.txt: [("'WHITE", 'WHITE'), ("'my", 'my'), ("'You", 'You'), ("'closes", 'closes'), ("'on", 'on'), ("'isle", 'isle'), ("'Spirit", 'Spirit')] PTAR184912XX-V01-06-page1.txt: [("'St.", 'St.')] PTAR184912XX-V01-06-page2.txt: [("'perfectly", 'perfectly')] PTAR184912XX-V01-06-page4.txt: [("'It", 'It'), ("'some", 'some'), ("'seeing", 'seeing'), ("'the", 'the')] PTAR184912XX-V01-06-page5.txt: [("'of", 'of'), ("'to", 'to')] PTAR184912XX-V01-06-page6.txt: [("'we", 'we')] PTAR185003XX-V01-07-page2.txt: [("'vas", 'vas')] PTAR185003XX-V01-07-page4.txt: [("'to", 'to')] PTAR185003XX-V01-07-page5.txt: [("'the", 'the')] PTAR185003XX-V01-07-page6.txt: [("'THE", 'THE'), ("'Gal.", 'Gal.'), ("'chapter", 'chapter'), ("'Aid", 'Aid'), ("'retired", 'retired'), ("'God", 'God'), ("'was", 'was'), ("'whe", 'whe'), ("'if", 'if'), ("'nine", 'nine'), ("'COMMAND", 'COMMAND'), ("'is", 'is'), ("'keeping", 'keeping'), ("'Wise", 'Wise')] PTAR185003XX-V01-07-page7.txt: [("'.", '.'), ("'verse", 'verse'), ("'righteous", 'righteous'), ("'of", 'of'), ("'whO", 'whO'), ("'inder", 'inder'), ("'Services", 'Services'), ("'ofthe", 'ofthe'), ("'The", 'The'), ("'IjEATH", 'IjEATH'), ("'The", 'The'), ("'is", 'is'), ("'cern", 'cern'), ("'the", 'the')] PTAR185003XX-V01-07-page8.txt: [("'covenant", 'covenant'), ("'the", 'the'), ("'.", '.')] PTAR185003XX-V01-08-page1.txt: [("'one", 'one')] PTAR185003XX-V01-08-page2.txt: [("'half", 'half'), ("'they", 'they'), ("'the", 'the'), ("'Me", 'Me'), ("'us", 'us'), ("'are", 'are'), ("'the", 'the'), ("'you", 'you')] PTAR185003XX-V01-08-page3.txt: [("'of", 'of'), ("'ehang.", 'ehang.'), ("'to", 'to'), ("'.'eiti'the", '.eitithe'), ("'those", 'those'), ("'by", 'by'), ("'righteousness", 'righteousness')] PTAR185003XX-V01-08-page4.txt: [("'above", 'above'), ("'One", 'One'), ('\'"', '"'), ("'was", 'was'), ("'and", 'and'), ("'the", 'the'), ("'be", 'be'), ("'can", 'can'), ("'elucidation", 'elucidation')] PTAR185003XX-V01-08-page5.txt: [("'two", 'two'), ("'a", 'a')] PTAR185003XX-V01-08-page6.txt: [("'Seventy", 'Seventy'), ("'of", 'of'), ("'the", 'the'), ("'A.", 'A.'), ("'or", 'or'), ("'seventieth", 'seventieth'), ("'is", 'is'), ("'chronology", 'chronology'), ("'a", 'a'), ("'or", 'or'), ("'one", 'one'), ("'of", 'of'), ("'of", 'of'), ("'then", 'then'), ("'point", 'point'), ("'why", 'why'), ("'were", 'were'), ("'until", 'until'), ("'given", 'given'), ("'but", 'but'), ("'true", 'true'), ("'chronology", 'chronology'), ("'Then", 'Then'), ("'the", 'the'), ("'the", 'the'), ("'the", 'the'), ("'commences.", 'commences.'), ("'Then", 'Then'), ("'confirmation", 'confirmation'), ("'Messiah.", 'Messiah.')] PTAR185003XX-V01-08-page7.txt: [("'step", 'step')] PTAR185003XX-V01-08-page8.txt: [("'TRUTH.", 'TRUTH.'), ("'AULT.", 'AULT.')] PTAR185004XX-V01-09-page1.txt: [("'reject", 'reject')] PTAR185004XX-V01-09-page2.txt: [("'the", 'the'), ("'message", 'message'), ("'God", 'God')] PTAR185004XX-V01-09-page3.txt: [("''commandments", 'commandments'), ("'lived", 'lived'), ("'which", 'which'), ("'Pope", 'Pope'), ("'the", 'the')] PTAR185004XX-V01-09-page4.txt: [("'God", 'God'), ("'patterns", 'patterns'), ("'an", 'an')] PTAR185004XX-V01-09-page5.txt: [("'drink", 'drink')] PTAR185004XX-V01-09-page6.txt: [("'that", 'that'), ("'his", 'his'), ("'our", 'our')] PTAR185004XX-V01-09-page7.txt: [("'nights", 'nights'), ("'I", 'I'), ("'sacrifice", 'sacrifice')] PTAR185005XX-V01-10-page2.txt: [("'jewels", 'jewels'), ("'months", 'months')] PTAR185005XX-V01-10-page3.txt: [("'ten", 'ten'), ("'since", 'since')] PTAR185005XX-V01-10-page4.txt: [("'are", 'are')] PTAR185005XX-V01-10-page5.txt: [("'Sanctuary", 'Sanctuary')] PTAR185005XX-V01-10-page7.txt: [("'and", 'and')] PTAR185008XX-V01-01-page10.txt: [("'this", 'this'), ("'And", 'And'), ("'to", 'to'), ("'And", 'And')] PTAR185008XX-V01-01-page11.txt: [("'ivork", 'ivork'), ("'it", 'it')] PTAR185008XX-V01-01-page13.txt: [("'done", 'done'), ("'them", 'them'), ('\'should"be', 'should"be'), ("'old", 'old')] PTAR185008XX-V01-01-page14.txt: [("'souls", 'souls')] PTAR185008XX-V01-01-page15.txt: [("'We", 'We'), ("'forty", 'forty'), ("'by", 'by')] PTAR185008XX-V01-01-page16.txt: [("'up", 'up'), ("'was", 'was')] PTAR185008XX-V01-01-page2.txt: [("'to", 'to'), ("'.", '.')] PTAR185008XX-V01-01-page3.txt: [("'each", 'each'), ("'their", 'their')] PTAR185008XX-V01-01-page4.txt: [("'selfish", 'selfish')] PTAR185008XX-V01-01-page6.txt: [("'was", 'was'), ("'for", 'for'), ("'I", 'I')] PTAR185008XX-V01-01-page7.txt: [("'midnight", 'midnight'), ("'after", 'after'), ("'false", 'false')] PTAR185008XX-V01-01-page9.txt: [("'is", 'is'), ("'answer", 'answer'), ("'Blessed", 'Blessed'), ("'to", 'to')] PTAR185008XX-V01-02-page1.txt: [("'opening", 'opening'), ("'Upon", 'Upon'), ("'and", 'and'), ("'on", 'on'), ("'little", 'little'), ("'was", 'was'), ("'everyfthing", 'everyfthing'), ("'Then", 'Then'), ("'as", 'as'), ("'sympathies", 'sympathies')] PTAR185008XX-V01-02-page10.txt: [("'around", 'around'), ("'they", 'they'), ("'will", 'will'), ("'All", 'All'), ("'edged", 'edged'), ("'being", 'being')] PTAR185008XX-V01-02-page11.txt: [("'them", 'them'), ("'down", 'down')] PTAR185008XX-V01-02-page12.txt: [("'to", 'to'), ("'There", 'There'), ("'be", 'be'), ("'of", 'of'), ("'Providence", 'Providence'), ("'vision", 'vision'), ("'in", 'in'), ("'the", 'the')] PTAR185008XX-V01-02-page13.txt: [("'tarrying.", 'tarrying.'), ("'him", 'him')] PTAR185008XX-V01-02-page14.txt: [("'rescripttire", 'rescripttire'), ("'question", 'question'), ("'despised.", 'despised.'), ("'fulfillment.", 'fulfillment.'), ("'were", 'were'), ("'by", 'by'), ("'for", 'for'), ("'Scipleship.", 'Scipleship.'), ("'from", 'from'), ("'if", 'if'), ("'or", 'or'), ("'all", 'all'), ("'prophecy", 'prophecy'), ("'supremacy", 'supremacy'), ("'whtCh", 'whtCh')] PTAR185008XX-V01-02-page16.txt: [("'testimony", 'testimony'), ("'speak", 'speak'), ("'the", 'the'), ("'give", 'give'), ("'AllParts", 'AllParts'), ("'you", 'you'), ("'fastened", 'fastened'), ('\'Church."', 'Church."'), ("'a", 'a'), ("'Provil", 'Provil'), ("'Unworthy", 'Unworthy'), ("'Divine", 'Divine'), ("'speak", 'speak')] PTAR185008XX-V01-02-page3.txt: [("'were", 'were'), ("'interwoven", 'interwoven'), ("''That", 'That'), ("'saw.", 'saw.'), ("'before", 'before'), ("'Perhaps", 'Perhaps'), ("'We", 'We'), ("'generation", 'generation'), ("'Christ", 'Christ'), ("'dear", 'dear'), ("'virgins", 'virgins'), ("'Matt..xxv", 'Matt..xxv'), ("'cry", 'cry'), ("'to", 'to'), ("'the", 'the')] PTAR185008XX-V01-02-page4.txt: [("'or", 'or')] PTAR185008XX-V01-02-page5.txt: [("'LAMP", 'LAMP'), ("'Many", 'Many'), ("'why", 'why'), ("'We", 'We'), ("'Strong", 'Strong'), ('\'Esdras."', 'Esdras."'), ("'they", 'they')] PTAR185008XX-V01-02-page7.txt: [("'cannot", 'cannot'), ("'of", 'of'), ("'Gentile", 'Gentile'), ("'as", 'as'), ("'Little", 'Little'), ("'permission", 'permission'), ("'out", 'out'), ("'stupendous", 'stupendous'), ("'cold", 'cold')] PTAR185008XX-V01-02-page8.txt: [("'record", 'record'), ("'litho", 'litho'), ("'of", 'of'), ("'with", 'with'), ("'roll", 'roll'), ("'the", 'the'), ("'we", 'we'), ("'the", 'the'), ("'If", 'If'), ("'the", 'the'), ("'and", 'and'), ("'scrap", 'scrap')] PTAR185008XX-V01-02-page9.txt: [("'triumph", 'triumph'), ("'Providence", 'Providence'), ("'servants", 'servants')] PTAR185009XX-V01-03-page1.txt: [("'the", 'the'), ("'of", 'of'), ('\'Word."', 'Word."')] PTAR185009XX-V01-03-page10.txt: [("'be", 'be'), ("'falsehood", 'falsehood'), ("'single", 'single'), ("'conduct", 'conduct'), ("'Holy", 'Holy'), ("'hive", 'hive'), ("'second", 'second'), ("'asked", 'asked'), ("'motive", 'motive'), ("'perfect", 'perfect'), ("'but", 'but')] PTAR185009XX-V01-03-page11.txt: [("'religious", 'religious'), ("'this", 'this')] PTAR185009XX-V01-03-page12.txt: [("'understanding", 'understanding'), ("'patteins", 'patteins'), ("'Lev", 'Lev'), ("'bring", 'bring'), ("'thine", 'thine'), ("'Lord", 'Lord'), ("'the", 'the'), ("'quoted", 'quoted'), ("'the", 'the')] PTAR185009XX-V01-03-page13.txt: [("'was", 'was')] PTAR185009XX-V01-03-page14.txt: [("'self", 'self')] PTAR185009XX-V01-03-page15.txt: [("'with", 'with')] PTAR185009XX-V01-03-page16.txt: [("'future", 'future'), ("'in", 'in'), ("'the", 'the')] PTAR185009XX-V01-03-page2.txt: [("'confirmed", 'confirmed'), ("'cannot", 'cannot')] PTAR185009XX-V01-03-page3.txt: [("'ascend", 'ascend'), ("'They", 'They')] PTAR185009XX-V01-03-page4.txt: [("'all", 'all'), ("'Judgment", 'Judgment'), ("'denying", 'denying')] PTAR185009XX-V01-03-page5.txt: [("'the", 'the'), ("'after", 'after'), ("'prophecy.", 'prophecy.'), ("'These", 'These'), ("'cross", 'cross'), ("'Second", 'Second')] PTAR185009XX-V01-03-page6.txt: [("'Hos", 'Hos')] PTAR185009XX-V01-03-page7.txt: [("'God", 'God'), ("'coin", 'coin'), ("'falling", 'falling'), ("'trial", 'trial'), ("'scenes", 'scenes'), ("'CANNOT", 'CANNOT'), ("'Messi", 'Messi'), ("'.not", '.not'), ("'WILLING", 'WILLING'), ("'Ohrist'as", 'Ohristas'), ("'.slough", '.slough'), ("'the", 'the'), ("'have", 'have'), ("'is", 'is'), ("'Amen", 'Amen'), ("'the", 'the')] PTAR185009XX-V01-03-page8.txt: [("'John's", 'Johns'), ("'has", 'has'), ("'of", 'of'), ("'cannot", 'cannot'), ("'Messiah.", 'Messiah.'), ("'who", 'who'), ("'wrong", 'wrong'), ("'Heaven", 'Heaven'), ("'that", 'that'), ("'being", 'being'), ("'King", 'King'), ("'right", 'right'), ("'prior", 'prior'), ("'the", 'the'), ("'hen", 'hen')] PTAR185009XX-V01-03-page9.txt: [("'crisis", 'crisis'), ("'without", 'without'), ("'camp", 'camp'), ("'For", 'For'), ("'held", 'held'), ("'having", 'having'), ("'untoAle", 'untoAle'), ("'that", 'that'), ("'done", 'done'), ("'the", 'the'), ("'beneath", 'beneath'), ("'you.", 'you.'), ("'command", 'command'), ("'names", 'names'), ("'withgreat", 'withgreat'), ("'are", 'are')] PTAR185009XX-V01-04-page1.txt: [("'years", 'years'), ("'shall", 'shall'), ("'could", 'could')] PTAR185009XX-V01-04-page10.txt: [("'Holy", 'Holy')] PTAR185009XX-V01-04-page11.txt: [("'These", 'These'), ("'the", 'the'), ("'Aaron", 'Aaron'), ("'was", 'was'), ("'the", 'the'), ("'bear", 'bear')] PTAR185009XX-V01-04-page12.txt: [("'and", 'and'), ("'of", 'of'), ("'Upon", 'Upon')] PTAR185009XX-V01-04-page13.txt: [("'entering", 'entering')] PTAR185009XX-V01-04-page14.txt: [("'was", 'was'), ("'opinion", 'opinion'), ("'whether", 'whether'), ("'behind", 'behind'), ("'aplace", 'aplace')] PTAR185009XX-V01-04-page16.txt: [("'Advent", 'Advent'), ("'the", 'the'), ("'their", 'their'), ("'N.", 'N.')] PTAR185009XX-V01-04-page3.txt: [("'AND", 'AND'), ("'new", 'new'), ("'REIGN", 'REIGN')] PTAR185009XX-V01-04-page4.txt: [("'the", 'the'), ("'their", 'their'), ("'His", 'His'), ("'And", 'And')] PTAR185009XX-V01-04-page6.txt: [("'judgment", 'judgment'), ("'saying", 'saying'), ("'looking", 'looking'), ("'year", 'year'), ("'This", 'This'), ("'saying", 'saying'), ("'the", 'the'), ("'had", 'had')] PTAR185009XX-V01-04-page7.txt: [("'We", 'We'), ("'To", 'To'), ("'have", 'have'), ("'Advent", 'Advent')] PTAR185009XX-V01-04-page8.txt: [("'their", 'their'), ("'Be", 'Be'), ("'having", 'having'), ("'Philadelphia", 'Philadelphia'), ("'I", 'I')] PTAR185009XX-V01-04-page9.txt: [("'foly", 'foly'), ("'which", 'which'), ("'first", 'first'), ("'Saviour", 'Saviour'), ("'it", 'it'), ("'we", 'we'), ("'he", 'he'), ("'his", 'his'), ("'ready", 'ready'), ("'believe", 'believe'), ("'of", 'of'), ("'After", 'After'), ("'And", 'And'), ("'that", 'that'), ("'he", 'he'), ("'Then", 'Then'), ("'in", 'in')] PTAR185009XX-V01-EX-page10.txt: [("'tried", 'tried'), ("'that", 'that')] PTAR185009XX-V01-EX-page11.txt: [("'othe.s", 'othe.s'), ("'restored", 'restored'), ("'the", 'the'), ("'dwell", 'dwell')] PTAR185009XX-V01-EX-page12.txt: [("'hath", 'hath'), ("'willing", 'willing')] PTAR185009XX-V01-EX-page13.txt: [("'because", 'because'), ("'and", 'and')] PTAR185009XX-V01-EX-page14.txt: [("'the", 'the'), ('\'"', '"')] PTAR185009XX-V01-EX-page15.txt: [("'died", 'died'), ("'according", 'according'), ("'a", 'a'), ("'the", 'the'), ("'Neither", 'Neither'), ("'law", 'law'), ("'was", 'was'), ("'declaring", 'declaring'), ("'is", 'is')] PTAR185009XX-V01-EX-page16.txt: [("'lived", 'lived'), ("'restitution", 'restitution'), ("'the", 'the'), ("'there", 'there'), ("'the", 'the'), ("'caught", 'caught'), ("'the", 'the')] PTAR185009XX-V01-EX-page17.txt: [("'keepcure", 'keepcure'), ("'HE", 'HE')] PTAR185009XX-V01-EX-page2.txt: [("'gospel", 'gospel'), ("'Auditean", 'Auditean'), ("'v.", 'v.'), ("'God", 'God'), ("'most", 'most'), ("'requires", 'requires')] PTAR185009XX-V01-EX-page3.txt: [("'and", 'and'), ("'general", 'general'), ("'tables", 'tables')] PTAR185009XX-V01-EX-page4.txt: [("'account", 'account'), ("'before.", 'before.'), ("'urge", 'urge')] PTAR185009XX-V01-EX-page5.txt: [("'them", 'them'), ("'roaring", 'roaring'), ("'of", 'of'), ("'See", 'See')] PTAR185009XX-V01-EX-page6.txt: [("'moss", 'moss'), ("'raise", 'raise')] PTAR185009XX-V01-EX-page7.txt: [("'GREAT", 'GREAT'), ("'Ahab", 'Ahab'), ("'sought", 'sought'), ("'Art", 'Art'), ("'Advent", 'Advent'), ("'of", 'of'), ("'preacher", 'preacher'), ("'exerciseth", 'exerciseth'), ("'Belau", 'Belau'), ("'to", 'to'), ("'his", 'his'), ("'be", 'be'), ("'Get", 'Get')] PTAR185009XX-V01-EX-page8.txt: [("'to", 'to'), ("'fit", 'fit'), ("'of", 'of'), ("'altar", 'altar'), ("'WIFE.", 'WIFE.')] PTAR185009XX-V01-EX-page9.txt: [("'roar", 'roar'), ("'And", 'And'), ("'darkening", 'darkening')] PTAR185011XX-V01-05-page1.txt: [('\'"ittee', '"ittee'), ("'BATES.", 'BATES.'), ("'Babylon", 'Babylon'), ("'try", 'try'), ("'Jesus", 'Jesus')] PTAR185011XX-V01-05-page2.txt: [("'calls", 'calls'), ("'voice", 'voice'), ("'given", 'given'), ("'simple", 'simple'), ("'of", 'of')] PTAR185011XX-V01-05-page3.txt: [("'the", 'the'), ("'necessity", 'necessity'), ("'been", 'been'), ("'what", 'what')] PTAR185011XX-V01-05-page4.txt: [("'udgment", 'udgment'), ("'city", 'city'), ("'difference", 'difference'), ("'fulfillment", 'fulfillment'), ("'heard", 'heard'), ("'seemed", 'seemed')] PTAR185011XX-V01-05-page5.txt: [("'virgins", 'virgins'), ("'point", 'point'), ("'denied", 'denied'), ("'courage", 'courage'), ("'was", 'was'), ("'before", 'before'), ("'of", 'of'), ("'to", 'to'), ("'better", 'better'), ("'late", 'late'), ("'been", 'been')] PTAR185011XX-V01-05-page6.txt: [("'And", 'And'), ("'Holy", 'Holy')] PTAR185011XX-V01-05-page7.txt: [("'King", 'King'), ("'it", 'it'), ("'had", 'had')] PTAR185011XX-V01-05-page8.txt: [("'of", 'of'), ("'mine", 'mine'), ("'from", 'from'), ("'Spirit", 'Spirit'), ("'interesting", 'interesting'), ("'thirst", 'thirst'), ("'feed", 'feed'), ("'shall", 'shall'), ("'Of'God", 'OfGod'), ("'river", 'river'), ("'from", 'from'), ("'GO.d", 'GO.d'), ("'and", 'and'), ("'the", 'the')] PTAR185011XX-V01-11-page1.txt: [("'the", 'the')] PTAR185011XX-V01-11-page3.txt: [("'but", 'but'), ("'how", 'how')] PTAR185011XX-V01-11-page4.txt: [("'ever", 'ever')] PTAR185011XX-V01-11-page5.txt: [("'BRO.", 'BRO.')] PTAR185011XX-V01-11-page6.txt: [("'Jesus", 'Jesus'), ("'clearly", 'clearly'), ("'of", 'of'), ("'his", 'his'), ("'faces", 'faces')] PTAR185011XX-V01-11-page7.txt: [("'that", 'that'), ("'Gentiles", 'Gentiles')] PTAR185011XX-V01-11-page8.txt: [("'There", 'There'), ("'learned", 'learned')] PTAR1850XXXX-VXX-XX-page10.txt: [("'BLISS", 'BLISS'), ("'is", 'is'), ("'answer", 'answer'), ("'every", 'every'), ("'Blessed", 'Blessed')] PTAR1850XXXX-VXX-XX-page11.txt: [("'He", 'He'), ("'And", 'And')] PTAR1850XXXX-VXX-XX-page12.txt: [("'and", 'and')] PTAR1850XXXX-VXX-XX-page13.txt: [("'Voice", 'Voice')] PTAR1850XXXX-VXX-XX-page16.txt: [("'Upon", 'Upon')] PTAR1850XXXX-VXX-XX-page19.txt: [("'connection", 'connection'), ("'rifled.", 'rifled.'), ("'why", 'why'), ("'We", 'We')] PTAR1850XXXX-VXX-XX-page23.txt: [("'did", 'did')] PTAR1850XXXX-VXX-XX-page27.txt: [("'forget", 'forget'), ("'t", 't')] PTAR1850XXXX-VXX-XX-page29.txt: [("'believed", 'believed')] PTAR1850XXXX-VXX-XX-page3.txt: [("'course", 'course'), ("'GOD", 'GOD')] PTAR1850XXXX-VXX-XX-page30.txt: [("'himself", 'himself'), ("'me", 'me'), ("'Lord", 'Lord')] PTAR1850XXXX-VXX-XX-page31.txt: [("'heart", 'heart'), ("'meek", 'meek')] PTAR1850XXXX-VXX-XX-page32.txt: [("'designs.", 'designs.')] PTAR1850XXXX-VXX-XX-page34.txt: [("'atching", 'atching'), ("'the", 'the')] PTAR1850XXXX-VXX-XX-page35.txt: [("'Covenant", 'Covenant'), ("'Amen.", 'Amen.'), ("'trust", 'trust'), ("'the", 'the')] PTAR1850XXXX-VXX-XX-page36.txt: [("'Behold", 'Behold'), ("'present", 'present')] PTAR1850XXXX-VXX-XX-page37.txt: [("'mai", 'mai'), ("'f'ound", 'found'), ("'Heaven", 'Heaven')] PTAR1850XXXX-VXX-XX-page38.txt: [("'cannot", 'cannot'), ("'Devlish", 'Devlish')] PTAR1850XXXX-VXX-XX-page39.txt: [("'unto", 'unto'), ("'that", 'that'), ("'If", 'If'), ("'Verily", 'Verily'), ("'that", 'that'), ("'The", 'The'), ("'a", 'a'), ("'Holy", 'Holy')] PTAR1850XXXX-VXX-XX-page4.txt: [("'shall", 'shall'), ("'as", 'as'), ("'first", 'first')] PTAR1850XXXX-VXX-XX-page40.txt: [("'Ex.", 'Ex.'), ("'was", 'was')] PTAR1850XXXX-VXX-XX-page43.txt: [("'The", 'The')] PTAR1850XXXX-VXX-XX-page44.txt: [("'Ver.", 'Ver.')] PTAR1850XXXX-VXX-XX-page45.txt: [("'their", 'their')] PTAR1850XXXX-VXX-XX-page46.txt: [("'.", '.')] PTAR1850XXXX-VXX-XX-page48.txt: [("'place", 'place'), ("'but", 'but'), ("'door", 'door'), ("'my", 'my')] PTAR1850XXXX-VXX-XX-page49.txt: [("'Iv", 'Iv'), ("'other", 'other')] PTAR1850XXXX-VXX-XX-page50.txt: [("'will", 'will')] PTAR1850XXXX-VXX-XX-page6.txt: [("'key", 'key'), ("'matter.", 'matter.'), ("'day", 'day')] PTAR1850XXXX-VXX-XX-page7.txt: [("'were", 'were'), ("'only", 'only'), ("'Kock", 'Kock'), ("'exclaimed", 'exclaimed'), ("'will", 'will'), ("'in", 'in'), ("'Ninevah's", 'Ninevahs'), ("'he", 'he'), ("'that", 'that'), ("'kingdom", 'kingdom')]
Check Correction 4¶
In [23]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/PTAR/correction4 Average verified rate: 0.9628746245411058 Average of error rates: 0.03694782608695653 Total token count: 224725
In [24]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[24]:
[("'", 543), ('th', 161), ('ch', 132), ('d', 117), ('n', 79), ('ex', 76), ('ver', 75), ('e', 71), ('t', 69), ('x', 58), ('m', 54), ('w', 53), ('ment', 46), ("the'", 45), ('r', 37), ('tion', 35), ('ly', 33), ('f', 28), ('g', 27), ('re', 26), ('ments', 21), ("and'", 20), ("to'", 18), ('eze', 17), ('br', 17), ('ry', 17), ('vt', 15), ('nant', 13), ('ful', 13), ('tuary', 12), ('tions', 12), ('es', 12), ('cy', 11), ('un', 11)]
Correction 5 -- Rejoin Split Words¶
In [25]:
# %load shared_elements/rejoin_split_words.py
prev = cycle
cycle = "correction5"
directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
os.makedirs(directories['cycle'])
corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))
for filename in corpus:
content = utilities.readfile(directories['prev'], filename)
text = re.sub(r"[0-9,!?$:;&]", " ", content)
tokens = utilities.tokenize_text(text)
errors = reports.identify_errors(tokens, spelling_dictionary)
replacements = clean.check_if_stem(errors, spelling_dictionary, tokens, get_prior=False)
if len(replacements) > 0:
print('{}: {}'.format(filename, replacements))
for replacement in replacements:
content = clean.replace_split_words(replacement, content)
else:
pass
with open(join(directories['cycle'], filename), mode="w") as o:
o.write(content)
o.close()
PTAR184907XX-V01-01-page5.txt: [('COV', 'ENANT')] PTAR184908XX-V01-02-page8.txt: [('pre', 'cious')] PTAR184908XX-V01-03-page2.txt: [('LOV', 'ED')] PTAR184908XX-V01-03-page8.txt: [('th', 'at')] PTAR184909XX-V01-04-page3.txt: [('DISPER', 'SIONS')] PTAR184912XX-V01-05-page1.txt: [('IMMORTALI', 'TY'), ('IMMOR', 'TALITY'), ('th', 'em')] PTAR184912XX-V01-05-page2.txt: [('JUSTIFI', 'CATION'), ('inte', 'rest')] PTAR184912XX-V01-05-page6.txt: [('th', 'e')] PTAR184912XX-V01-05-page8.txt: [('TR', 'UTH')] PTAR184912XX-V01-06-page2.txt: [('TABER', 'NACLE')] PTAR184912XX-V01-06-page7.txt: [('eigh', 't')] PTAR185003XX-V01-07-page4.txt: [('TA', 'BLES')] PTAR185003XX-V01-07-page5.txt: [('Tes', 'tament')] PTAR185003XX-V01-07-page6.txt: [('fal', 'len')] PTAR185003XX-V01-07-page7.txt: [('em', 'braced')] PTAR185003XX-V01-08-page1.txt: [('noth', 'ing')] PTAR185003XX-V01-08-page3.txt: [('re', 'main')] PTAR185003XX-V01-08-page4.txt: [('sanctua', 'ry')] PTAR185003XX-V01-08-page5.txt: [('HEAV', 'ENLY'), ('circum', 'stances')] PTAR185003XX-V01-08-page7.txt: [('PEO', 'PLE'), ('PLE', 'A')] PTAR185004XX-V01-09-page2.txt: [('MESSA', 'GE')] PTAR185004XX-V01-09-page5.txt: [('Ex', 'tra')] PTAR185005XX-V01-10-page1.txt: [('pre', 'pare')] PTAR185005XX-V01-10-page2.txt: [('ta', 'Me')] PTAR185005XX-V01-10-page4.txt: [('IMA', 'GES'), ('HEAV', 'ENS')] PTAR185008XX-V01-01-page11.txt: [('Mor', 'iah')] PTAR185008XX-V01-01-page13.txt: [('TA', 'N')] PTAR185008XX-V01-01-page16.txt: [('co', 'worker')] PTAR185008XX-V01-01-page2.txt: [('oc', 'H')] PTAR185008XX-V01-01-page6.txt: [('re', 'SEARCHING')] PTAR185008XX-V01-01-page8.txt: [('EX', 'PECT')] PTAR185008XX-V01-01-page9.txt: [('th', 'or'), ('ment', 'on')] PTAR185008XX-V01-02-page1.txt: [('Vo', 'L'), ('saTsu', 'MA')] PTAR185008XX-V01-02-page10.txt: [('unim', 'portant')] PTAR185008XX-V01-02-page14.txt: [('re', 'adjust'), ('confe', 'sSing'), ('confes', 's'), ('concep', 'tion')] PTAR185008XX-V01-02-page16.txt: [('Ged', 'or')] PTAR185008XX-V01-02-page5.txt: [('coun', 't')] PTAR185008XX-V01-02-page7.txt: [("ham'", 's')] PTAR185009XX-V01-03-page11.txt: [('blasphe', 'ming')] PTAR185009XX-V01-03-page15.txt: [('od', 'is')] PTAR185009XX-V01-03-page2.txt: [('un', 'impaired')] PTAR185009XX-V01-03-page6.txt: [('un', 'It')] PTAR185009XX-V01-03-page7.txt: [("Christ'", 's')] PTAR185009XX-V01-04-page15.txt: [('th', 'At'), ('al', 'ways')] PTAR185009XX-V01-04-page6.txt: [('re', 'arouse')] PTAR185009XX-V01-04-page7.txt: [('re', 'examining'), ('th', 'e')] PTAR185009XX-V01-04-page9.txt: [('zi', 'n')] PTAR185009XX-V01-EX-page11.txt: [('th', 'y'), ('Itt', 'a')] PTAR185009XX-V01-EX-page2.txt: [('unnecessa', 'ry'), ('fa', 'th')] PTAR185009XX-V01-EX-page5.txt: [('Egy', 'ptians')] PTAR185011XX-V01-05-page1.txt: [('th', 'at')] PTAR185011XX-V01-11-page3.txt: [('OB', 'SERVED')] PTAR185011XX-V01-11-page4.txt: [('sus', 'I')] PTAR185011XX-V01-11-page5.txt: [('PR', 'ES'), ('th', 'at')] PTAR1850XXXX-VXX-XX-page10.txt: [('th', 'or'), ('ment', 'on')] PTAR1850XXXX-VXX-XX-page13.txt: [('Cincin', 'nati')] PTAR1850XXXX-VXX-XX-page18.txt: [('carryin', 'g')] PTAR1850XXXX-VXX-XX-page24.txt: [('tr', 'ek'), ('Ers', 'E')] PTAR1850XXXX-VXX-XX-page25.txt: [('plen', 'ty'), ('obei', 'sance')] PTAR1850XXXX-VXX-XX-page28.txt: [('re', 'adjust')] PTAR1850XXXX-VXX-XX-page3.txt: [('giv', 'en')] PTAR1850XXXX-VXX-XX-page31.txt: [('co', 'exist')] PTAR1850XXXX-VXX-XX-page35.txt: [('un', 'It')] PTAR1850XXXX-VXX-XX-page37.txt: [('re', 'I')] PTAR1850XXXX-VXX-XX-page38.txt: [('FAI', 'L')] PTAR1850XXXX-VXX-XX-page43.txt: [('ex', 'press')] PTAR1850XXXX-VXX-XX-page46.txt: [('TI', 's')] PTAR1850XXXX-VXX-XX-page50.txt: [('th', 'At'), ('al', 'ways')] PTAR1850XXXX-VXX-XX-page51.txt: [('RE', 'In')] PTAR1850XXXX-VXX-XX-page7.txt: [('re', 'SEARCHING'), ('cer', 'tainty')]
Check Correction 5¶
In [26]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/PTAR/correction5 Average verified rate: 0.9632425599344794 Average of error rates: 0.03657391304347826 Total token count: 224662
In [27]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[27]:
[("'", 543), ('th', 155), ('ch', 132), ('d', 117), ('n', 78), ('ver', 75), ('ex', 73), ('e', 68), ('t', 67), ('x', 58), ('m', 54), ('w', 53), ("the'", 45), ('ment', 44), ('r', 37), ('tion', 34), ('ly', 33), ('f', 28), ('g', 27), ('ments', 21), ("and'", 20), ('re', 18), ("to'", 18), ('eze', 17), ('br', 17), ('ry', 17), ('vt', 15), ('nant', 13), ('ful', 13), ('tuary', 12), ('tions', 12), ('cy', 11), ('es', 11)]
Correction 6 -- Rejoin Split Words II¶
In [28]:
# %load shared_elements/rejoin_split_words.py
prev = cycle
cycle = "correction6"
directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
os.makedirs(directories['cycle'])
corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))
for filename in corpus:
content = utilities.readfile(directories['prev'], filename)
text = re.sub(r"[0-9,!?$:;&]", " ", content)
tokens = utilities.tokenize_text(text)
errors = reports.identify_errors(tokens, spelling_dictionary)
replacements = clean.check_if_stem(errors, spelling_dictionary, tokens, get_prior=True)
if len(replacements) > 0:
print('{}: {}'.format(filename, replacements))
for replacement in replacements:
content = clean.replace_split_words(replacement, content)
else:
pass
with open(join(directories['cycle'], filename), mode="w") as o:
o.write(content)
o.close()
PTAR184907XX-V01-01-page3.txt: [('PER', 'PETUAL'), ('COM', 'MANDMENTS')] PTAR184907XX-V01-01-page4.txt: [('cove', 'nant'), ('COVE', 'NANT')] PTAR184907XX-V01-01-page5.txt: [('COMMAND', 'MENTS')] PTAR184908XX-V01-02-page3.txt: [('MIN', 'ISTRATION')] PTAR184908XX-V01-02-page7.txt: [('LAW', 'FUL')] PTAR184908XX-V01-02-page8.txt: [('pre', 'cious')] PTAR184908XX-V01-03-page4.txt: [('COM', 'MANDMENTS')] PTAR184912XX-V01-05-page1.txt: [('th', 'em')] PTAR184912XX-V01-05-page2.txt: [('constant', 'ly')] PTAR184912XX-V01-05-page8.txt: [('TR', 'UTH')] PTAR185003XX-V01-07-page2.txt: [('COM', 'MANDMENTS')] PTAR185003XX-V01-07-page3.txt: [('sin', 'gular')] PTAR185003XX-V01-07-page4.txt: [('TA', 'BLES'), ('cove', 'nant'), ('COMMAND', 'MENTS')] PTAR185003XX-V01-07-page6.txt: [('COMMAND', 'MENTS')] PTAR185003XX-V01-07-page7.txt: [('Minis', 'tration'), ('MINIS', 'TRATION')] PTAR185003XX-V01-08-page3.txt: [('govern', 'ment')] PTAR185003XX-V01-08-page4.txt: [('sanctua', 'ry'), ('apart', 'ment')] PTAR185003XX-V01-08-page7.txt: [('con', 'fidence')] PTAR185004XX-V01-09-page7.txt: [('do', 'minical')] PTAR185005XX-V01-10-page4.txt: [('A', 'fter')] PTAR185005XX-V01-10-page7.txt: [('At', 'tains')] PTAR185008XX-V01-01-page9.txt: [('the', 're')] PTAR185008XX-V01-02-page1.txt: [('saTsu', 'MA'), ('ben', 'eath')] PTAR185008XX-V01-02-page10.txt: [('unim', 'portant')] PTAR185008XX-V01-02-page11.txt: [('s', 'ated')] PTAR185008XX-V01-02-page6.txt: [('so', 'dding')] PTAR185009XX-V01-03-page14.txt: [('of', 'fering')] PTAR185009XX-V01-03-page7.txt: [('reason', 'ers'), ('do', 'ubt'), ('Chris', 'tian')] PTAR185009XX-V01-04-page1.txt: [('JUDG', 'MENT')] PTAR185009XX-V01-04-page14.txt: [('a', 'sa')] PTAR185009XX-V01-04-page6.txt: [('and', 're')] PTAR185009XX-V01-EX-page11.txt: [('COMMAND', 'MENTS')] PTAR185009XX-V01-EX-page13.txt: [('and', 're'), ('gath', 'ering'), ('WON', 'DERS')] PTAR185009XX-V01-EX-page16.txt: [('the', 're')] PTAR185009XX-V01-EX-page2.txt: [('AND', 'RE'), ('c', 'haracter'), ('back', 'slidings'), ('con', 'fidence')] PTAR185009XX-V01-EX-page3.txt: [('com', 'mandments')] PTAR185009XX-V01-EX-page5.txt: [('time', 'ly'), ('no', 'es')] PTAR185009XX-V01-EX-page7.txt: [('d', 'estined')] PTAR185011XX-V01-05-page2.txt: [('united', 'ly')] PTAR185011XX-V01-05-page7.txt: [('set', 'tled')] PTAR185011XX-V01-11-page1.txt: [('P', 'UB')] PTAR185011XX-V01-11-page3.txt: [('maids', 'ervant')] PTAR185011XX-V01-11-page4.txt: [('I', 're'), ('a', 'cer')] PTAR185011XX-V01-11-page7.txt: [('lite', 'ral')] PTAR185011XX-V01-11-page8.txt: [('love', 'ly'), ('dis', 'ciples')] PTAR1850XXXX-VXX-XX-page13.txt: [('Cincin', 'nati')] PTAR1850XXXX-VXX-XX-page2.txt: [('m', 'itt')] PTAR1850XXXX-VXX-XX-page25.txt: [('obei', 'sance')] PTAR1850XXXX-VXX-XX-page33.txt: [('do', 'ubtful')] PTAR1850XXXX-VXX-XX-page39.txt: [('b', 'eep')] PTAR1850XXXX-VXX-XX-page43.txt: [('ha', 'th'), ('of', 'fering')] PTAR1850XXXX-VXX-XX-page45.txt: [('reconcile', 'ment')] PTAR1850XXXX-VXX-XX-page49.txt: [('a', 'sa')] PTAR1850XXXX-VXX-XX-page5.txt: [('be', 'ri')] PTAR1850XXXX-VXX-XX-page7.txt: [('cer', 'tainty'), ('a', 'cer')]
Check Correction 6¶
In [29]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/PTAR/correction6 Average verified rate: 0.9634385240062687 Average of error rates: 0.03627391304347827 Total token count: 224608
In [30]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[30]:
[("'", 543), ('th', 154), ('ch', 132), ('d', 116), ('n', 78), ('ver', 75), ('ex', 73), ('e', 68), ('t', 67), ('x', 58), ('m', 53), ('w', 53), ("the'", 45), ('ment', 43), ('r', 37), ('tion', 34), ('ly', 31), ('f', 28), ('g', 27), ("and'", 20), ("to'", 18), ('eze', 17), ('ments', 17), ('br', 17), ('ry', 17), ('vt', 15), ('re', 13), ('nant', 12), ('ful', 12), ('tuary', 12), ('tions', 12), ('cy', 11)]
Review Remaining Errors¶
In [31]:
reports.docs_with_high_error_rate(summary)
Out[31]:
[]
In [32]:
reports.long_errors(errors_summary)
Out[32]:
(['holichasten', 'entreatfellow', 'the-commandments', 'whicircontained', 'unrighteousimpressions', 'commandinents', 'couragement', 'scriptuthou', "discovered'", 'resurection', 'hearejected', 'incorparating', "instituted'", 'fulfilltnent', 'contrabecause', 'constantorder', 'blasphemeia', 'wateradrenehed', 'thelanguage', 'experiencwithout', 'acqaintance', "thebeliever's", 'opinequally', 'aldisturbance', 'peoclaiming', 'theirjudgment', 'constantmeeting', 'accomparadox', 'wasitttreduced', 'drunken-harlot', "earth'-wherein", 'forguidance', 'immeliately', "prophecies'", 'sciiptiires', 'hvbelieving', 'ahaortlained', 'occacross-bearing', 'commencepectations', "warclownie'r", "adventists'", 'expericially', 'congregution', 'indionation', 'everlusting', 'and-expected', 'seventlimonth', 'positivelydeciared', 'howgdaiamthat', 'wriversiort', 'sabbath-day', 'christiansin', 'imporshould', 'deliverence', 'forgotstruction', 'bescattering', 'testimocomprehend', 'atonesanctuary', 'meneorialwhich', 'isvautitagy', 'thereloreit', 'chastenings', "eustachius's", "the'atonement", "became'acquainted", "we'establish", 'coneersions', 'knovvleclgp', 'everyfthing', 'congrenakedness', 'ofunfaithful', 'commandmentmust', 'truitettutiearro', 'semi-monthly', 'testamentorder', 'rhinistratian', 'connectedeand', 'confotinded', 'opirtienethat', 'proclafused', 'rtghteousness', 'hand-writing', 'manifesttake', 'obstapressly', 'semi-monthly---by', "september'and", "calculated'", 'thetseventhday', 'romangenerid', 'hovedensays', 'leavitigtheir', 'hascorrupted', 'abominationsof', 'teneommandments', 'thattheangel', 'inpreparation', 'noonewhohasreadthe', 'inforination', "creition't'", 'inlookingiorthis', 'mesmerfathers', 'fustlighton', 'politicoreligious', 'avoidfinally', 'purunbelievers', 'knowledgeto', 'exereiselts', 'fairhatution', "durable'ric", 'badacheareth', 'tlieseventh', 'blasphemeus', 'lhtetiodhogitnacal', 'evensingular', 'covegreater', "'afterwards'", 'sinoffering', 'commandmentd', "themselves'", 'chitshowing', 'father-inprepared', 'ransoineeloia', 'wonder-working', 'irresistable', 'inconceivprofessed', 'freighttures', 'infrodaction', "ofswine'stlesb", 'disorgapizers', 'fulfilvictory', 'thenewtestament', 'interferende', 'fhultsandsins', 'prefeelings', 'the-earthly', 'twoinsthutions', 'incessantbath', 'intercedthen', 'assemperished', "jerusalein's", 'accuratainment', 'prereflecting', 'inprovidence', 'underheavenly', 'rescripttire', 'thepreaence', 'terproclamation', 'nsvowientheir', 'felloviship', 'certainlyis', 'finishincorporated', "sabbathday's", 'utterwinter', 'jointheirship', 'expreslamps', 'reviewshould', 'knowlothers', 'tmmediately', 'thereuntoperfect', 'maintaincifies', 'gabrietrefers', 'neweovettant', 'twoministrations', 'wiroverturn', 'therresurre', 'eighthehapters', 'breast-plate', 'fergirtness', "tuary'comprehened", 'somejoyment', 'therrianner', 'sancenclosed', 'soul-thrilling', 'histbriclestimony', 'deprofessedly', 'offeringsffoorr', 'scripfessing', 'ciiiitetlit', 'proviprophetic', 'somehonestly', 'theoparliai', 'sacenelosed', 'cominandments', 'fififiliment', 'atprophetic', 'arscripture', "passengers'", 'sabbathdays', 'frornthataifs', 'continstood', 'commiandnient', 'cowmenquence', 'developement', 'censtired-and', 'greatandnotabledayefthe', 'perfectioifilment', 'nectionbetween', 'eommandnients', "other'errors", 'propheivord', 'congregamake', 'trailsgressions', 'hzitokingbbantidtlatrtitvet', 'trespassoffering', 'publishment', 'preparatoryscenes', "t'tshayts'err", 'shouhclread', "behoovecl'him", 'tresspassed', 'coniktetion', 'ccunmaridnients', 'atonepresent', 'sabbathbreaking', 'affecslumbers', 'preciliation', 'cronologers', 'perioditire', 'unconqueraing', 'commandlation', 'tellsfisilial', 'witnessedthe', 'thefoundation', 'cornmandments', 'spperstition', 'anftrttering', 'paperssince', 'sunplesiewr', 'slaveholding', 'sincommentators', 'setipttires', 'no-commandment', 'conflagracorrupted', 'cdritmencententat', 'putrifaction', 'allimportant', "angers'message", 'anotheresteemeth', 'eityroafill', 'protabernacle', 'xpeciatiens', 'righteousto', 'fulfillxxiv', "declaration'", 'concluinfluence', 'unavoidaing', 'onlyapparent', 'aftefchrist', 'nearlyevery', 'councounterfeit', 'terminatwave', 'thottirenant', 'ourpeetings', 'congrecrati', 'thrgiveness', 'thedoetrine', 'infidelstleny', 'acknowledgeand', 'meetwhatever', 'importurned', 'admonisealing', 'scripresurrection', 'concepthemselves', 'spirrepresented', 'convincedthat', 'guidseasons', 'fruitofthearticles', '---language', 'onuonizavnt', 'pronotinced', 'thenecessity', "my'suffering", 'eammandment', 'orcongregation', 'neartherighttime', "exciting'oecasion", 'calculatiens', 'mahammedans', 'conaccomplishing', 'uncircumeision', 'neliuzhadnezzar', 'thopeculiai', 'meetsolemhly', 'confidencein', 'extravigances', 'appropripassage', 'ectridttatesrien', 'uninformation', 'disappomted', 'revelerstand', 'catareceive', 'fulmovements', 'considerfrom', "proclaimed'", 'offerplaces', 'bejustified', 'ifinnediately', 'philadelchildren', 'thatsitteth', 'sixteenthehapter', 'midnightcry', "separavah's", 'colsoiation', 'yourconsecration', 'univittingly', 'fountairrof', 'munsonville', 'soul-purifying', 'dispeisation', 'publicaprepared', 'illusguests', 'iswerrettaired', 'bridedisappointed', 'proviexistence', 'tnisconceived', 'chromildgycif', 'ascertainterpositions', "representing'", 'pbrasenlogy', 'preachevent', 'thoughtseine', 'direcdinibted', 'apfurnishes', 'cliurchl-that', 'rightmanifest', 'haveesreriep', 'anomplished', 'thoroughlyhealed', 'bedeliberate', 'instrticties', 'rerepresent', 'especiallythose', 'testifytutto', 'preparatoing', 'reproach-of', 'forlittoith', 'rentainethi', 'comtnencing', 'tribulatiom', 'plagueamong', 'forgivenesi', 'congregatiou', 'ratifollowing', 'trtbuicition', 'movetherefore', 'symbolicallyderusalem', 'philadelphi', 'medo-persia', 'ishmaelitish', 'terminatein', 'incorporatedit', 'self-deceived', "his'brethren", 'derangewhich', 'forgiveignorance', 'commadments', "revelation'", 'agitationand', 'accomplishthose', 'sin-offerings', "abandon'our", 'conlingering', 'ourbackelidings', 'evidentpossible', 'atonecording', 'oomniandmettet', 'rootnedoffspring', 'understandpromises', 'soul-destroying', 'messageswere', 'compreheaded', 'towardsthem', 'specifiviously', 'confiverted', 'typiidolators', 'becausethese', 'notnetiftei', 'coreetterrox', 'eveningsacrifice', 'thatieventy', 'withmetehhuunnttupbio', 'precludethe', 'break-plate', 'somepointment', 'faappointnient', 'thatthoumayest', 'decreclaimed', 'retraditions', 'still-rests', 'regumarriage', 'fulfillrested', "by'succession", 'comtnandinent', 'manministry', 'initiseventh', 'minusustain', 'pecessaryto', 'fallibilithese', "be'assigned", 'acceptatrodden', 'intheforenoon', 'scape-goats', 'differoneebetween', 'offerthousand', 'positiveinstitution', 'demonstraentered', 'comfaithful', 'consistedin', 'institutioni', 'astonishthe', 'oththerefitie', 'improvethent', "associated'", 'seeinhabitants', "'alstachius", 'tarmovement', 'mornprepared', "bampfield's", 'blinduessin', 'coivimandments', 'spruce-street', 'christenthe', 'disasterons', 'desolatidns', 'archeumstances', "selves'shall", 'meetcreased', 'readprayers', 'accuradence', 'experiencso', 'confidenttion', "oursaviour's", 'forgivenesslif', 'sevcaptivity', "the'question", 'encourainfluence', 'sublunagroom', 'whichportance', "chronolog'ers", 'demenstrates', 'enragacceptance', 'beenrepresented', 'fnlfidinent', 'thelridegroom', 'comreconcile', 'dominhistory', 'personagesappear', 'comtgandlitents', 'thasacritice', "and'reminds", 'cleantermination', 'commandhave', 'greathurden', 'aeiseveration', 'serighteous', 'hisposition', 'presumptous', 'ecelesiastical', 'everblessfoes', 'thispassageicor', "confirmarrangements'", 'therestitution', 'nebitchadneasar', 'belongingito', 'fulfillnient', 'distingnished', 'consequenge', 'passconfessed', 'deceivedland', 'deaexpressions', 'thoumystery', 'awakenrubbish', 'jeconverted', 'engngetiness', 'mernconkssed', 'expectrimmed', 'revelationk', 'tirtisatenol', 'commandrhents', 'circumstanble', 'subsectnent', 'provfarther', "ou'rblessed", 'itevidently', 'prevideneenever', 'saerifising', 'communiprep', 'ofreligious', 'wholesystem', 'image-beast', "s'uappcolsed", 'their-iniquities', 'reconciliapass', 'unchangable', 'threatnings', "found'within", 'ndperfectfulfilment', 'paramunications', 'eterlsatnddyi', 'combiaiadments', "irkreasing'", 'sancbeareth', 'thesanctuary', 'wildernessof', 'colsolation', 'adventtause', 'wicked-shall', "andhavingha'dthemwashedawayha", 'mentionwhich', 'cleanssomething', 'yourassertion', 'thefulfillment', 'exarriining', 'counterfeitprovidence', 'performinot', 'sabbathkeeping', 'providencebe', 'witdescribed', 'translasabbath', 'prophecyand', 'destructiou', 'cornquestion', "close'eonnection", 'hauglitiness', 'righteoussaid', 'pprelielicied', 'everyvestige', 'confessionof', 'thescriptures', 'thatbelieve', 'tiiinacqeas', 'back-slidden', 'baknowledge', 'troubviilttake', 'countersalvation', 'derangecovenant', 'clamorabout', 'seventhchapter', 'dorninicalday', 'eontemtranslator', 'soul-stirring', 'repronounce', 'evidencethat-he', 'instituticin', 'theininistration', 'parighteous', 'transgressorsunder', 'whichtrodden', 'faithfulwitness', 'jerusawrought', 'commandmente', 'sublunairksome', 'creawitnessed', 'desirableday', 'iriterferred', "endeavoring'", 'throtighlim', 'rejectedthis', "conviction'", 'oodpromised', 'ailwasidsome', 'scrippeople', 'badtheeffect', 'thetnselves', 'abbathintothefirst', 'circumcisedin', 'aldiscovered', 'importinfinthefigfiliment', 'disapbelievers', 'resurreetion', 'beth-shemesh', 'isolatevdent', 'prejlidices', 'orsalvation', 'fuladvoeated', 'procamation', 'tdeincrgenses', 'ceritreport', 'preparationt', 'cargiveness', 'commandmesntof', 'prodoctrine', 'blessidentifid', 'backfulfilled', 'knowlcross-bearing', 'theroselies', 'inconceivagainst', 'ancleentinued', 'circumstanthe', 'thronestast', 'terminatioa', 'testimoniesin', 'wasnailekto', "expressive'", 'interpositon', 'startingspointot', "vather'airight", 'englandyniton', 'ungressions', 'sabanointed', "jesus'lovely", 'assuranceis', 'sanetifieth', 'crucifixipti', 'thatinoment', 'wilderblood', 'occaministry', 'shuntpossible', 'teachconceptions', 'scapederful', 'sublime-thought', 'sotereignty', 'offerpowerful', 'predicttime', 'saebalthrsalelitwactstruspr', 'fifty-eighth', 'recencilitheir', "macknight's", 'andrespectable', 'nterethiaestbe', 'attendwithout', 'theabominations', 'steppinginto', 'particubath', 'commandnient', 'choinhabitants', 'sanctuaryof', 'descendeorrune', 'tetclielden', 'revealdence', 'fiemembranze', 'wellthatthey', 'burnwaiting', 'definitetime', 'witthsrised', 'andijeliverance', 'tistisriony', 'righteousor', 'ofcultivation', 'stitchregard', 'pereconciled', 'disaolution', 'merchrndise', 'fontainbletm', 'iitterifice', 'guidthelonb', 'cornforfing', 'fearlesssummation', "his'daughter", 'havingrejected', 'tiventyrfive', 'gyochtottize', 'nebuchadnezar', 'building--many', 'backalidings', 'eraphatithe', 'ordinancesof', 'arititypical', 'wouldanswer', 'denyingthat', 'siinplicity', 'theinterpretation', 'humilliation', 'motradiction', "shalt'bring", 'pretendthat', 'ourdisappointthent', 'abankindred', 'scatteringt', 'thurnmimlight', 'pheiscourtette', 'ceintaining', 'inteligence', 'thbeitiannhato', 'commandmentand', 'intersessor', 'desoliation', 'confidentbe', 'jesuslaught', 'fulfilledat', 'hventreyiew', 'immormoment', 'aupperaarepared', 'propiiesyings', 'christjetoil', 'crucifixstill', 'sabanimosities', 'censurlieve', "description'cam", 'ittandmenti', 'righteousues', 'expomodesty', 'miscontending', 'publithroughout', 'anchshoutings', 'ourdeatlaike', 'pourtuguese', 'ethinpnaocr', 'reignoffered', 'objectionis', 'amuiriciacould', 'causrighteous', 'revealedywill', 'resurrdction', 'presanctifying', 'retnrnedhome', 'magdeburgenses', 'simplicityof', 'idolwiththe', "definition'", 'hiscleansing', 'laocliceans', 'tabernatuary', 'ofjudiement', 'thispromise', 'perfecttabernacle', "god'sblessing", "will'require", 'undoubtedthe', 'examiningthe', 'justifihath', "tabernaand'", 'abolishished', "doubt'after", 'stumbling-blocks', 'reprophetic', 'fulfillrhent', 'aposaccustomed', 'accordagencies', 'desirthough', 'praccurrence', 'minietration', 'judgfurniture', 'interferred', 'standingcorn', "cvrtaiuly'it", 'chronologers', 'proclaimthemselves', 'counterfeitand', 'forlteeping', 'asupplement', 'wordfortheniselves', 'ournaistake', 'theyunderstand', 'therumbling', 'conprophecies', 'theytweltteiohuenuerioint', 'timeiinstead', 'conelusionof', 'decentlyand', 'reconipense', 'tarryrefine', 'confessiontehacksliddn', 'commandedto', 'fearlesshas', 'adsomething', "whereged'speople", 'whethagainst', 'meetmerchants', 'solethnized', "professed't", 'ministrationof', 'applyingthese', "reedived'froin", 'lordpalsied', 'distracttturn', 'destruilion', 'seientlfday'], 10)
Correction 7 -- Separate Squashed Words¶
In [33]:
# %load shared_elements/separate_squashed_words.py
import pandas as pd
from math import log
prev = cycle
cycle = "correction7"
directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
os.makedirs(directories['cycle'])
corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))
verified_tokens = []
for filename in corpus:
content = utilities.readfile(directories['prev'], filename)
clean.get_approved_tokens(content, spelling_dictionary, verified_tokens)
tokens_with_freq = dict(collections.Counter(verified_tokens))
words = pd.DataFrame(list(tokens_with_freq.items()), columns=['token','freq'])
words_sorted = words.sort_values('freq', ascending=False)
words_sorted_short = words_sorted[words_sorted.freq > 2]
sorted_list_of_words = list(words_sorted_short['token'])
corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))
for filename in corpus:
content = utilities.readfile(directories['prev'], filename)
text = utilities.strip_punct(content)
tokens = utilities.tokenize_text(text)
wordcost = dict((k, log((i+1)*log(len(sorted_list_of_words)))) for i,k in enumerate(sorted_list_of_words))
maxword = max(len(x) for x in sorted_list_of_words)
replacements = []
for token in tokens:
if not token.lower() in spelling_dictionary:
if len(token) > 15:
if re.search(r"[\-\-\'\"]", token):
pass
else:
split_string = clean.infer_spaces(token, wordcost, maxword)
list_split_string = split_string.split()
if clean.verify_split_string(list_split_string, spelling_dictionary):
replacements.append((token, split_string))
else:
pass
else:
pass
else:
pass
if len(replacements) > 0:
print("{}: {}".format(filename, replacements))
for replacement in replacements:
content = clean.replace_pair(replacement, content)
else:
pass
with open(join(directories['cycle'], filename), mode="w") as o:
o.write(content)
o.close()
PTAR184909XX-V01-04-page6.txt: [('positiveinstitution', 'positive institution')] PTAR184912XX-V01-05-page1.txt: [('transgressorSunder', 'transgressor S under')] PTAR184912XX-V01-05-page2.txt: [('thoroughlyhealed', 'thoroughly healed')] PTAR184912XX-V01-05-page3.txt: [('theinterpretation', 'the interpretation')] PTAR184912XX-V01-05-page8.txt: [('greatandnotabledayefthe', 'great and not a b led a ye f t h e')] PTAR185003XX-V01-07-page6.txt: [('Wordfortheniselves', 'Word for then i selves'), ('anotheresteemeth', 'another esteemeth')] PTAR185003XX-V01-07-page7.txt: [('positivelydecIared', 'positively de c I a red'), ('twoministrations', 'two ministration s'), ('theininistration', 'the in in i st r a t i o n')] PTAR185003XX-V01-08-page3.txt: [('personagesappear', 'person ages appear')] PTAR185003XX-V01-08-page4.txt: [('cleantermination', 'clean termination')] PTAR185003XX-V01-08-page6.txt: [('startingspointot', 'start ing s point o t')] PTAR185003XX-V01-08-page7.txt: [('perfectioifilment', 'perfect i o if i l men t')] PTAR185004XX-V01-09-page7.txt: [('abbathintothefirst', 'a b bath into the first')] PTAR185005XX-V01-10-page3.txt: [('Noonewhohasreadthe', 'No one who has read the')] PTAR185008XX-V01-01-page3.txt: [('commencepectations', 'commence p e c t a t i o n s')] PTAR185008XX-V01-01-page5.txt: [('conaccomplishing', 'con accomplishing')] PTAR185008XX-V01-02-page14.txt: [('ourdiSappointthent', 'our di S a p point then t')] PTAR185008XX-V01-02-page16.txt: [('preparatoryscenes', 'preparatory scenes')] PTAR185008XX-V01-02-page3.txt: [('Bridedisappointed', 'Bride disappointed'), ('inlookingiorthis', 'in looking i or this')] PTAR185008XX-V01-02-page9.txt: [('countersalvation', 'counter salvation')] PTAR185009XX-V01-03-page11.txt: [('politicoreligious', 'p o lit i c o religious')] PTAR185009XX-V01-03-page13.txt: [('perfecttabernacle', 'perfect tabernacle')] PTAR185009XX-V01-03-page4.txt: [('inconceivprofessed', 'in c once iv professed')] PTAR185009XX-V01-03-page9.txt: [('yourconsecration', 'your consecration'), ('histbriclestiMony', 'hi st b r i c l e s t i M o n y')] PTAR185009XX-V01-04-page8.txt: [('Philadelchildren', 'P hi l ad el children')] PTAR185009XX-V01-04-page9.txt: [('trespassoffering', 'trespass offering')] PTAR185009XX-V01-EX-page10.txt: [('LhtetiodhogitnAcal', 'L h t e t i o d h o g i t n A c a l')] PTAR185009XX-V01-EX-page13.txt: [('proclaimthemselves', 'proclaim themselves')] PTAR185009XX-V01-EX-page8.txt: [('eveningsacrifice', 'evening sacrifice')] PTAR185011XX-V01-05-page7.txt: [('symbolicallyderusalem', 'symbol i call y der us a lem')] PTAR185011XX-V01-11-page4.txt: [('encourainfluence', 'en c our a influence')] PTAR1850XXXX-VXX-XX-page13.txt: [('fruitofthearticles', 'fruit of the articles')] PTAR1850XXXX-VXX-XX-page21.txt: [('fearlesssummation', 'fear less sum m a t i o n')] PTAR1850XXXX-VXX-XX-page26.txt: [('ascertainterpositions', 'as certain ter positions')] PTAR1850XXXX-VXX-XX-page28.txt: [('concepthemselves', 'c once p themselves'), ('teachconceptions', 'teach conceptions')] PTAR1850XXXX-VXX-XX-page31.txt: [('understandpromises', 'understand promises')] PTAR1850XXXX-VXX-XX-page34.txt: [('inconceivagainst', 'in c once iv against')] PTAR1850XXXX-VXX-XX-page37.txt: [('counterfeitprovidence', 'counterfeit providence')] PTAR1850XXXX-VXX-XX-page43.txt: [('thereuntoperfect', 'thereunto perfect')] PTAR1850XXXX-VXX-XX-page44.txt: [('trespassoffering', 'trespass offering'), ('forgiveignorance', 'for give ignorance')] PTAR1850XXXX-VXX-XX-page6.txt: [('conaccomplishing', 'con accomplishing')]
Check Correction 7¶
In [34]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/PTAR/correction7 Average verified rate: 0.963552162714036 Average of error rates: 0.03613913043478261 Total token count: 224787
In [35]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[35]:
[("'", 543), ('th', 154), ('ch', 132), ('d', 117), ('n', 83), ('t', 79), ('ver', 75), ('ex', 73), ('e', 72), ('x', 58), ('m', 55), ('w', 53), ("the'", 45), ('ment', 43), ('r', 39), ('tion', 34), ('ly', 31), ('f', 29), ('g', 28), ("and'", 20), ("to'", 18), ('eze', 17), ('ments', 17), ('br', 17), ('ry', 17), ('vt', 15), ('re', 13), ('nant', 12), ('ful', 12), ('tuary', 12), ('tions', 12), ('cy', 11)]
In [ ]: