LibM-OCR-Evaluation-and-Correction
In [1]:
%load_ext autoreload
In [2]:
%autoreload 2
In [3]:
from text2topics import reports
from text2topics import utilities
from text2topics import clean
import re
import os
from os import listdir
from os.path import isfile, join
import collections
In [4]:
%matplotlib inline
In [5]:
wordlist_dir = "/Users/jeriwieringa/Dissertation/drafts/data/word-lists"
wordlists = ["2016-12-07-SDA-last-names.txt",
"2016-12-07-SDA-place-names.txt",
"2016-12-08-SDA-Vocabulary.txt",
"2017-01-03-place-names.txt",
"2017-02-14-Base-Word-List-SCOWL&KJV.txt",
"2017-02-14-Roman-Numerals.txt",
"2017-03-01-Additional-Approved-Words.txt"
]
In [6]:
spelling_dictionary = utilities.create_spelling_dictionary(wordlist_dir, wordlists)
In [7]:
title = "LibM"
In [8]:
base_dir = "/Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/{}/".format(title)
Baseline¶
In [9]:
cycle = 'baseline'
In [10]:
stats = reports.overview_report(join(base_dir, cycle), spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/LibM/baseline Average verified rate: 0.9276951364862356 Average of error rates: 0.08840278796771826 Total token count: 1502679
In [11]:
errors_summary = reports.get_errors_summary( stats )
reports.top_errors( errors_summary, 500 )
Out[11]:
[('ñ', 5427), ('-', 1800), ('re-', 1618), ('con-', 1590), ("'", 1508), ('tion', 1443), ('m', 1251), ('d', 1242), ('¥', 1236), ('in-', 1093), ('w', 937), ('e', 925), (')', 906), ('ment', 864), ('t', 803), ('n', 748), ('de-', 745), ('be-', 695), ('+', 648), ('com-', 643), ('r', 626), ('f', 594), ('pro-', 571), ('sun-', 502)]
Check Special Character Use¶
In [12]:
reports.tokens_with_special_characters(errors_summary)[:200]
Out[12]:
[('ñ', 5427), ('¥', 1236), (')', 906), ('+', 648), ('(', 478), ('/', 388), ('=', 193), ('(affiliated', 173), ('*', 169), ('•', 137), ('ã', 134), ('(affil-', 132), ('(af-', 119), ('_', 99), ('(see', 94), ('¥¥', 76), ('%', 73), ('[the', 72), ('(or', 65), (']', 61), ('¥¥¥', 56), ('(the', 55), ('ó', 52), ('year)', 49), ('[', 49), ('\\', 43), ('newfoundland)', 43), ('carolina)', 43), ('alaska)', 42), ('wyoming)', 42), ('mexico)', 42), ('wisconsin)', 42), ('mississippi)', 42), ('maryland)', 42), ('dakota)', 42), ('island)', 42), ('arizona)', 42), ('(secretary', 40), ('(western)', 39), ('(affili-', 38), ('++', 37), ('o)', 36), ('(southern)', 35), ('(payable', 33), ('ô', 32), ('(continued', 31), ('(west)', 31), ('>', 30), ('¥¥¥¥', 30), ('(s', 30), ('(h', 29), ('`', 29), ('office)', 25), ('longacre)', 25), ('(a', 24), ('england)', 21), ('¥=', 20), ('(and', 19), ('[of', 19), ('[sunday]', 19), ('chesapeake)', 18), ('(which', 18), ('(eastern)', 18), ('southern)', 18), ('m¥', 18), ('(concluded', 18), ('(south)', 17), ('(north)', 17), ('tennessee)', 17), ('¡', 17), ('(east)', 17), ('(northern)', 16), ('„', 16), ('catholic)', 15), ('(i', 14), ('sunday]', 14), ('(washington', 14), ('(tennessee)', 14), ('=¥', 14), ('ñthe', 14), ('(over)', 13), ('(page', 13), ('#', 13), ('(australia)', 13), ('(england)', 13), ('¤', 13), ('\ufeff', 13), ('(roman', 13), ('(france)', 12), ('(italics', 12), ('(nebraska)', 12), ('i%', 12), ('¥¥¥¥¥¥', 12), ('**', 12), ('(massachusetts)', 12), ('(exchange', 11), ('sunday)', 11), ('(if', 11), ('¥-', 11), ('(peru)', 11), ('saskatchewan)', 11), ('(chile)', 11), ('(cross', 11), ('—', 10), ('i¥', 10), ('-¥', 10), ('the¥', 10), ('(western', 10), ('[in', 10), ('¥the', 10), ('(a)', 9), ('(california)', 9), ('(e', 9), ('(subscriptions', 9), ('(may', 9), ('(to', 9), ('i)', 9), ('(i)', 9), ("'ñ", 9), ('accepted)', 9), ('<', 9), ('(your', 9), ('(greater)', 9), ('[a', 9), ('desired)', 9), ('c)', 8), ('(b)', 8), ('th¥', 8), ('¤¤', 8), ('post-office)', 8), ('ña', 8), ('(frontispiece)', 8), ('ãã', 8), ('[not', 8), ('(baptist)', 8), ('excepted)', 8), ('(central)', 8), ("'¥", 7), ('//', 7), ('¥and', 7), ('law]', 7), ('york)', 7), ('¥m', 7), ('day]', 7), ('=m', 7), ('(catholic)', 7), ('church]', 7), ('(signed)', 7), ('©', 7), ('(as', 7), ('ñid', 7), ('[mr', 6), ('(two', 6), ('(pa', 6), ('(minnesota)', 6), ('••', 6), ('/m', 6), ('m=', 6), ('(mr', 6), ('~~', 6), ('=¥¥', 6), ('(n', 6), ('(for', 6), ('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++', 6), ('[for', 6), ('(by', 6), ('(canadian', 6), ('(works', 6), ('(sunday)', 6), ('(rev', 6), ('(new', 6), ('day)', 6), ('(in', 6), ('(lay', 6), ('(of', 6), ('¥¥¥¥¥', 5), ('numbers)', 5), ('ñibid', 5), ('(poetry)', 5), ('<>', 5), ('[civil', 5), ('(civil', 5), ('labor)', 5), ('attach\x8e', 5), ('i/', 5), ('(first', 5), ('}', 5), ('(dec', 5), ('(saturday', 5), ('prescott)', 5), ('[sunday', 5), ('♦', 5), ('(r', 5), ('(john', 5), ('[christ]', 5), ('~', 5), ('ñhon', 5), ('(d', 5), ('laws]', 5), ('++++++++++++++++++++++++++++++++++++++++++++++++++++++++', 5)]
Correction 1 -- Normalize Characters¶
In [13]:
# %load shared_elements/normalize_characters.py
prev = cycle
cycle = "correction1"
directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
os.makedirs(directories['cycle'])
corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))
for filename in corpus:
content = utilities.readfile(directories['prev'], filename)
# Substitute for all other dashes
content = re.sub(r"—-—–‑", r"-", content)
# Substitute formatted apostrophe
content = re.sub(r"\’\’\‘\'\‛\´", r"'", content)
# Replace all special characters with a space (as these tend to occur at the end of lines)
content = re.sub(r"[^a-zA-Z0-9\s,.!?$:;\-&\'\"]", r" ", content)
with open(join(directories['cycle'], filename), mode="w") as o:
o.write(content)
o.close()
In [14]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/LibM/correction1 Average verified rate: 0.9373310897970699 Average of error rates: 0.0763136463683052 Total token count: 1492287
In [15]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[15]:
[('-', 1932), ('re-', 1620), ('con-', 1590), ("'", 1578), ('tion', 1446), ('m', 1334), ('d', 1256), ('in-', 1094), ('e', 985), ('w', 952), ('ment', 864), ('t', 837), ('n', 774), ('de-', 745), ('be-', 696), ('r', 663), ('com-', 645), ('f', 624), ('pro-', 572), ('sun-', 505), ('ex-', 450), ('en-', 446), ('tions', 404), ('g', 384), ('ligious', 367), ('per-', 361), ('dis-', 360), ('un-', 357), ('relig-', 351), ('na-', 328), ('gov-', 324), ('ob-', 323), ('chris-', 298), ('govern-', 292), ('x', 265), ('ernment', 260), ('ious', 258), ('ac-', 250), ('erty', 237), ('ance', 236), ('lib-', 236), ('pre-', 235), ('sab-', 234), ('ments', 233), ('ad-', 230), ('reli-', 219), ('tional', 211), ('ligion', 209), ('u', 206), ('im-', 206)]
Correction 2 -- Fix Line Endings¶
In [16]:
# %load shared_elements/correct_line_endings.py
prev = cycle
cycle = "correction2"
directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
os.makedirs(directories['cycle'])
corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))
for filename in corpus:
content = utilities.readfile(directories['prev'], filename)
content = re.sub(r"(\w+)(\-\s{1,})([a-z]+)", r"\1\3", content)
with open(join(directories['cycle'], filename), mode="w") as o:
o.write(content)
o.close()
In [17]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/LibM/correction2 Average verified rate: 0.9784327331755492 Average of error rates: 0.037992296404988996 Total token count: 1452618
In [18]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[18]:
[('-', 1918), ("'", 1578), ('m', 1330), ('d', 1250), ('e', 976), ('w', 951), ('t', 819), ('n', 772), ('r', 662), ('f', 620), ('g', 381), ('x', 265), ('u', 205), ('k', 188), ('tv', 150), ('th', 118), ('sunday-law', 112), ('--', 111), ('postmaster-general', 106), ('pa', 101), ('sunday-closing', 73), ('z', 72), ('un-american', 72), ('id', 71), ('io', 70), ('statute-books', 66), ('post-offices', 66), ('co', 61), ('church-and-state', 60), ('mo', 60), ('ga', 58), ('va', 56), ('attorney-general', 56), ('ex', 51), ('re', 48), ('sunday-rest', 48), ('tion', 45), ('mm', 42), ('q', 41), ('mt', 41), ('wm', 38), ('pp', 38), ('re-', 38), ('ro', 37), ('charta', 37), ('mi', 36), ('li', 36), ('---', 36), ('present-day', 35), ('religio-political', 33)]
Correction 3 -- Remove extra dashes¶
In [19]:
# %load shared_elements/remove_extra_dashes.py
prev = cycle
cycle = "correction3"
directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
os.makedirs(directories['cycle'])
corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))
for filename in corpus:
content = utilities.readfile(directories['prev'], filename)
text = re.sub(r"[0-9,!?$:;&]", " ", content)
tokens = utilities.tokenize_text(text)
replacements = []
for token in tokens:
if token[0] is "-":
replacements.append((token, token[1:]))
elif token[-1] is "-":
replacements.append((token, token[:-1]))
else:
pass
if len(replacements) > 0:
print("{}: {}".format(filename, replacements))
for replacement in replacements:
content = clean.replace_pair(replacement, content)
else:
pass
with open(join(directories['cycle'], filename), mode="w") as o:
o.write(content)
o.close()
LibM19060401-V01-01-page1.txt: [('support.-', 'support.')] LibM19060401-V01-01-page10.txt: [('sanc-', 'sanc')] LibM19060401-V01-01-page11.txt: [('Mc-', 'Mc'), ('Mc-', 'Mc')] LibM19060401-V01-01-page12.txt: [('organiza-', 'organiza')] LibM19060401-V01-01-page14.txt: [('altogether."-', 'altogether."')] LibM19060401-V01-01-page15.txt: [('-the', 'the')] LibM19060401-V01-01-page17.txt: [('-II.', 'II.'), ('de-', 'de')] LibM19060401-V01-01-page19.txt: [('-haracterized', 'haracterized')] LibM19060401-V01-01-page21.txt: [('pe-', 'pe')] LibM19060401-V01-01-page27.txt: [('-', '')] LibM19060401-V01-01-page31.txt: [('-', '')] LibM19060401-V01-01-page32.txt: [('-', '')] LibM19060401-V01-01-page34.txt: [('-never', 'never'), ('-', ''), ('-', ''), ('-', ''), ('prin-', 'prin')] LibM19060401-V01-01-page35.txt: [('-', ''), ('-', ''), ('-', ''), ('-gh-l-', 'gh-l-'), ('ner-', 'ner'), ('-', ''), ('-', ''), ('--afilhir', '-afilhir'), ('--', '-'), ('-', ''), ('-', ''), ('-iSW', 'iSW'), ('-', ''), ('-cirm', 'cirm'), ('-', ''), ('--ant', '-ant'), ('-', ''), ('-', ''), ("'e-Ihttnii-ti--", "'e-Ihttnii-ti-"), ('-owl-', 'owl-'), ('-', ''), ('it-', 'it'), ('--', '-'), ('-', ''), ('-ao-te', 'ao-te'), ('-viez', 'viez'), ('-', ''), ('derwee.--', 'derwee.-'), ('..pieLese--', '..pieLese-'), ('o-', 'o'), ('.-', '.'), ('-or--.', 'or--.'), ('tr.-', 'tr.'), ('-', ''), ('-ezel', 'ezel')] LibM19060401-V01-01-page36.txt: [('SURMOUNT-', 'SURMOUNT')] LibM19060401-V01-01-page4.txt: [('opin-', 'opin'), ('-', '')] LibM19060401-V01-01-page6.txt: [('-', '')] LibM19060401-V01-01-page8.txt: [('en-', 'en')] LibM19060701-V01-02-page1.txt: [('support.-', 'support.')] LibM19060701-V01-02-page12.txt: [('LIB-', 'LIB')] LibM19060701-V01-02-page13.txt: [('-', '')] LibM19060701-V01-02-page15.txt: [('com-', 'com')] LibM19060701-V01-02-page18.txt: [('exer-', 'exer')] LibM19060701-V01-02-page27.txt: [('mis-', 'mis')] LibM19060701-V01-02-page3.txt: [('C--', 'C-'), ('----', '---'), ('--', '-'), ('-', ''), ('-----------.', '----------.')] LibM19060701-V01-02-page34.txt: [('-', ''), ('-', ''), ('-', '')] LibM19060701-V01-02-page5.txt: [('coun-', 'coun')] LibM19060701-V01-02-page7.txt: [('-', '')] LibM19061001-V01-03-page11.txt: [('per-', 'per')] LibM19061001-V01-03-page15.txt: [('inn-', 'inn')] LibM19061001-V01-03-page17.txt: [('legal-', 'legal'), ('-', '')] LibM19061001-V01-03-page18.txt: [('-', ''), ('coun-', 'coun')] LibM19061001-V01-03-page19.txt: [('secu-', 'secu')] LibM19061001-V01-03-page20.txt: [('-', ''), ('-', ''), ('-', '')] LibM19061001-V01-03-page24.txt: [('profana-', 'profana')] LibM19061001-V01-03-page26.txt: [('-governor', 'governor')] LibM19061001-V01-03-page27.txt: [('-of', 'of'), ('modifi-', 'modifi')] LibM19061001-V01-03-page28.txt: [('s--', 's-')] LibM19061001-V01-03-page3.txt: [('-earing', 'earing')] LibM19061001-V01-03-page30.txt: [('op-', 'op')] LibM19061001-V01-03-page31.txt: [('Paid-', 'Paid'), ('free-', 'free')] LibM19061001-V01-03-page34.txt: [('-', ''), ('-', ''), ('-', '')] LibM19061001-V01-03-page4.txt: [('rhoreh-and-', 'rhoreh-and')] LibM19061001-V01-03-page8.txt: [('-TOUSES', 'TOUSES')] LibM19061001-V01-03-page9.txt: [('-', '')] LibM19070101-V02-01-page12.txt: [('them-', 'them'), ('.-', '.'), ('-', ''), ('A-', 'A')] LibM19070101-V02-01-page14.txt: [('-conscience', 'conscience')] LibM19070101-V02-01-page17.txt: [('accord-', 'accord')] LibM19070101-V02-01-page18.txt: [('Con-', 'Con')] LibM19070101-V02-01-page2.txt: [('--', '-'), ('-', ''), ('-----', '----'), ('-', ''), ('-', ''), ('-', ''), ('--', '-'), ('---', '--'), ('-', '')] LibM19070101-V02-01-page21.txt: [('en-', 'en')] LibM19070101-V02-01-page22.txt: [('-', '')] LibM19070101-V02-01-page23.txt: [('Mc-', 'Mc')] LibM19070101-V02-01-page25.txt: [('Postmaster-', 'Postmaster'), ('in-', 'in'), ('-', ''), ('-', ''), ('-', ''), ('-', '')] LibM19070101-V02-01-page3.txt: [('gov-', 'gov')] LibM19070101-V02-01-page30.txt: [('-', ''), ('-', '')] LibM19070101-V02-01-page31.txt: [('-', '')] LibM19070101-V02-01-page34.txt: [('-', ''), ('-', ''), ('-', '')] LibM19070101-V02-01-page35.txt: [('-', '')] LibM19070101-V02-01-page7.txt: [('-', '')] LibM19070101-V02-01-page8.txt: [('un-', 'un')] LibM19070401-V02-02-page12.txt: [('hun-', 'hun')] LibM19070401-V02-02-page13.txt: [('Sunday-', 'Sunday')] LibM19070401-V02-02-page14.txt: [('Sun-', 'Sun')] LibM19070401-V02-02-page16.txt: [('-', ''), ('enfor-', 'enfor')] LibM19070401-V02-02-page17.txt: [('LAN-', 'LAN')] LibM19070401-V02-02-page18.txt: [('men.-', 'men.')] LibM19070401-V02-02-page19.txt: [('-', '')] LibM19070401-V02-02-page22.txt: [('Medo-', 'Medo')] LibM19070401-V02-02-page26.txt: [('consulted.-', 'consulted.')] LibM19070401-V02-02-page27.txt: [('--', '-'), ('---', '--'), ('-', ''), ('-', ''), ('..-', '..'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-.j.i.i.', '.j.i.i.'), ('-f', 'f'), ('-', ''), ('-', ''), ('-I', 'I'), ('I-', 'I'), ("-'il", "'il")] LibM19070401-V02-02-page3.txt: [('Chi-', 'Chi'), ('-', '')] LibM19070401-V02-02-page32.txt: [('-', '')] LibM19070401-V02-02-page34.txt: [('-', ''), ('-', ''), ('-', '')] LibM19070401-V02-02-page4.txt: [('-', ''), ('Lewis-', 'Lewis'), ('-', '')] LibM19070401-V02-02-page6.txt: [('an-', 'an')] LibM19070701-V02-03-page14.txt: [('-', ''), ('rec-', 'rec')] LibM19070701-V02-03-page15.txt: [('-', ''), ('-', ''), ('at-', 'at')] LibM19070701-V02-03-page17.txt: [('Indepen-', 'Indepen')] LibM19070701-V02-03-page18.txt: [('agi-', 'agi')] LibM19070701-V02-03-page19.txt: [('James-', 'James')] LibM19070701-V02-03-page20.txt: [('Chris-', 'Chris')] LibM19070701-V02-03-page21.txt: [('-', ''), ('-', ''), ('-', '')] LibM19070701-V02-03-page22.txt: [('-', ''), ('FOUNDATIONS.-', 'FOUNDATIONS.')] LibM19070701-V02-03-page23.txt: [('Mc-', 'Mc'), ('Mc-', 'Mc')] LibM19070701-V02-03-page24.txt: [('-', '')] LibM19070701-V02-03-page25.txt: [('-', '')] LibM19070701-V02-03-page27.txt: [('-', '')] LibM19070701-V02-03-page28.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] LibM19070701-V02-03-page3.txt: [('Protes-', 'Protes')] LibM19070701-V02-03-page34.txt: [('-', ''), ('-', ''), ('-', ''), ('Act-of-', 'Act-of')] LibM19070701-V02-03-page8.txt: [('LIB-', 'LIB'), ('-', '')] LibM19070701-V02-03-page9.txt: [('As-', 'As'), ('-', '')] LibM19071001-V02-04-page10.txt: [('cor-', 'cor')] LibM19071001-V02-04-page12.txt: [('-that', 'that')] LibM19071001-V02-04-page14.txt: [('--', '-'), ('Vice-', 'Vice')] LibM19071001-V02-04-page15.txt: [('con-', 'con')] LibM19071001-V02-04-page17.txt: [('in-', 'in'), ('-', ''), ('-', '')] LibM19071001-V02-04-page18.txt: [('-', '')] LibM19071001-V02-04-page19.txt: [('-', '')] LibM19071001-V02-04-page20.txt: [('-', '')] LibM19071001-V02-04-page21.txt: [('doc-', 'doc')] LibM19071001-V02-04-page22.txt: [('-', '')] LibM19071001-V02-04-page23.txt: [('-', '')] LibM19071001-V02-04-page24.txt: [('rneas-', 'rneas')] LibM19071001-V02-04-page26.txt: [('----', '---'), ('-', '')] LibM19071001-V02-04-page27.txt: [('-', '')] LibM19071001-V02-04-page28.txt: [('-', '')] LibM19071001-V02-04-page29.txt: [('Record-', 'Record')] LibM19071001-V02-04-page30.txt: [('decep-', 'decep'), ('-', ''), ('-', '')] LibM19071001-V02-04-page31.txt: [('-', ''), ('-', '')] LibM19071001-V02-04-page32.txt: [('law-', 'law'), ('impor-', 'impor'), ('Sab-', 'Sab'), ('-orb', 'orb'), ('re-', 're'), ('-', '')] LibM19071001-V02-04-page38.txt: [('-', ''), ('essen-', 'essen')] LibM19071001-V02-04-page40.txt: [('-', '')] LibM19071001-V02-04-page44.txt: [('--', '-')] LibM19071001-V02-04-page46.txt: [('-', '')] LibM19071001-V02-04-page48.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('Sov-', 'Sov'), ('-e', 'e'), ('-', '')] LibM19071001-V02-04-page49.txt: [('-page', 'page'), ('-page', 'page')] LibM19071001-V02-04-page50.txt: [('-', ''), ('-', ''), ('-', '')] LibM19071001-V02-04-page51.txt: [('-THAT', 'THAT'), ('-', ''), ('lande.-', 'lande.'), ('temert.-', 'temert.'), ('-', ''), ('Colooiso.-', 'Colooiso.'), ('-', ''), ('velour...ref.-', 'velour...ref.'), ('hands.-', 'hands.'), ('people.-', 'people.'), ('-They', 'They'), ('-W', 'W'), ('-TTE', 'TTE')] LibM19071001-V02-04-page8.txt: [('hier-', 'hier')] LibM19080101-V03-01-page1.txt: [('-', ''), ('-.', '.')] LibM19080101-V03-01-page14.txt: [('-', '')] LibM19080101-V03-01-page15.txt: [('-', '')] LibM19080101-V03-01-page20.txt: [('un-', 'un'), ('un-', 'un'), ('presi-', 'presi')] LibM19080101-V03-01-page22.txt: [('set-', 'set')] LibM19080101-V03-01-page23.txt: [('-in', 'in')] LibM19080101-V03-01-page24.txt: [('-', '')] LibM19080101-V03-01-page26.txt: [('PRES-', 'PRES')] LibM19080101-V03-01-page32.txt: [('in-', 'in')] LibM19080101-V03-01-page34.txt: [('Ren-', 'Ren')] LibM19080101-V03-01-page36.txt: [('haz-', 'haz')] LibM19080101-V03-01-page39.txt: [('de-', 'de')] LibM19080101-V03-01-page40.txt: [('docu-', 'docu')] LibM19080101-V03-01-page41.txt: [('self-govern-', 'self-govern')] LibM19080101-V03-01-page47.txt: [('As--', 'As-')] LibM19080101-V03-01-page48.txt: [('-', ''), ('Under-', 'Under'), ('-', ''), ('-', ''), ('Sab-', 'Sab'), ('-', ''), ('Sov-', 'Sov'), ('-', ''), ('-', '')] LibM19080101-V03-01-page49.txt: [('-page', 'page')] LibM19080101-V03-01-page50.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('LIB-', 'LIB')] LibM19080101-V03-01-page51.txt: [('-..ter', '..ter'), ('-', '')] LibM19080101-V03-01-page6.txt: [('gen-', 'gen')] LibM19080101-V03-01-page8.txt: [('-', '')] LibM19080401-V03-02-page1.txt: [('.-q"P--', '.-q"P-'), ('mutummimmomminumummummumunimmiumummummlimummmumumunummtimummimintowitmmummrx--.-', 'mutummimmomminumummummumunimmiumummummlimummmumumunummtimummimintowitmmummrx--.'), ('-', ''), ('-.-', '.-'), ('-', ''), ('-TuaDCII', 'TuaDCII')] LibM19080401-V03-02-page11.txt: [('gov-', 'gov')] LibM19080401-V03-02-page12.txt: [('-', '')] LibM19080401-V03-02-page14.txt: [('-', '')] LibM19080401-V03-02-page16.txt: [('-', ''), ('sab-', 'sab')] LibM19080401-V03-02-page18.txt: [('-day', 'day'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('Georgia-', 'Georgia'), ('-', ''), ('Illinois-', 'Illinois'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] LibM19080401-V03-02-page21.txt: [('con-', 'con')] LibM19080401-V03-02-page24.txt: [('-', ''), ('suc-', 'suc')] LibM19080401-V03-02-page3.txt: [('pos-', 'pos')] LibM19080401-V03-02-page30.txt: [('Postmaster-', 'Postmaster'), ('la-', 'la')] LibM19080401-V03-02-page35.txt: [('-', '')] LibM19080401-V03-02-page36.txt: [('re-', 're')] LibM19080401-V03-02-page41.txt: [('com-', 'com')] LibM19080401-V03-02-page43.txt: [('-as', 'as'), ('-', ''), ('tol-', 'tol')] LibM19080401-V03-02-page44.txt: [('every-', 'every')] LibM19080401-V03-02-page48.txt: [('Under-', 'Under'), ('Sab-', 'Sab'), ('Sov-', 'Sov')] LibM19080401-V03-02-page50.txt: [('-', '')] LibM19080401-V03-02-page51.txt: [('ntitzu-', 'ntitzu'), ('-', '')] LibM19080401-V03-02-page9.txt: [('-', ''), ('gov-', 'gov')] LibM19080701-V03-03-page1.txt: [('--ff', '-ff'), ('-', ''), ('--', '-'), ('-', ''), ('-', ''), ('-.', '.'), ('-', ''), ('muminatatimiumumuutumitimmittimmummminnumminumuffiummumummunnomiminummuummummimmumnini-', 'muminatatimiumumuutumitimmittimmummminnumminumuffiummumummunnomiminummuummummimmumnini'), ('-Z', 'Z'), ('-', ''), ('---v', '--v')] LibM19080701-V03-03-page11.txt: [('-', '')] LibM19080701-V03-03-page14.txt: [('free-', 'free')] LibM19080701-V03-03-page20.txt: [('-', ''), ('-', ''), ('-', ''), ('re-', 're')] LibM19080701-V03-03-page21.txt: [('-', ''), ('-', ''), ('-', '')] LibM19080701-V03-03-page22.txt: [('-', ''), ('I-', 'I')] LibM19080701-V03-03-page26.txt: [('-', '')] LibM19080701-V03-03-page28.txt: [('-', ''), ('-', ''), ('-', '')] LibM19080701-V03-03-page29.txt: [('-', '')] LibM19080701-V03-03-page30.txt: [('-', '')] LibM19080701-V03-03-page31.txt: [('-o', 'o'), ('-', '')] LibM19080701-V03-03-page32.txt: [('na-', 'na')] LibM19080701-V03-03-page39.txt: [('opin-', 'opin')] LibM19080701-V03-03-page40.txt: [('Con-', 'Con'), ('-President', 'President')] LibM19080701-V03-03-page43.txt: [('-finest', 'finest'), ('uni-', 'uni'), ('-versal', 'versal')] LibM19080701-V03-03-page45.txt: [('prohib-', 'prohib')] LibM19080701-V03-03-page47.txt: [('-', '')] LibM19080701-V03-03-page48.txt: [('Revelation.-', 'Revelation.'), ('-', ''), ('-', ''), ('Under-', 'Under'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('Sov-', 'Sov'), ('Christ-', 'Christ')] LibM19080701-V03-03-page49.txt: [('-', '')] LibM19080701-V03-03-page5.txt: [('con-', 'con')] LibM19080701-V03-03-page50.txt: [('-', '')] LibM19080701-V03-03-page52.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] LibM19080701-V03-03-page6.txt: [('pas-', 'pas')] LibM19080701-V03-03-page7.txt: [('-', ''), ('-the', 'the')] LibM19080701-V03-03-page9.txt: [('Post-', 'Post')] LibM19081001-V03-04-page1.txt: [('E-', 'E'), ('-', ''), ('-rI', 'rI'), ('-', ''), ('-', '')] LibM19081001-V03-04-page13.txt: [('ef-', 'ef')] LibM19081001-V03-04-page14.txt: [('-', ''), ('state-', 'state')] LibM19081001-V03-04-page15.txt: [('-', '')] LibM19081001-V03-04-page18.txt: [('In-', 'In')] LibM19081001-V03-04-page19.txt: [('-', '')] LibM19081001-V03-04-page20.txt: [('-', '')] LibM19081001-V03-04-page22.txt: [('es-', 'es'), ('for-', 'for')] LibM19081001-V03-04-page23.txt: [('sab-', 'sab'), ('some-', 'some')] LibM19081001-V03-04-page26.txt: [('-hall', 'hall')] LibM19081001-V03-04-page33.txt: [('--', '-')] LibM19081001-V03-04-page34.txt: [('tend-', 'tend'), ('stri-', 'stri')] LibM19081001-V03-04-page37.txt: [('punment.--', 'punment.-'), ('imprison-', 'imprison'), ('--', '-')] LibM19081001-V03-04-page39.txt: [('-', '')] LibM19081001-V03-04-page4.txt: [('com-', 'com')] LibM19081001-V03-04-page40.txt: [('com-', 'com')] LibM19081001-V03-04-page41.txt: [('remem-', 'remem'), ('-which', 'which')] LibM19081001-V03-04-page43.txt: [('pecul-', 'pecul')] LibM19081001-V03-04-page44.txt: [('un-', 'un'), ('-', '')] LibM19081001-V03-04-page47.txt: [('repu-', 'repu')] LibM19081001-V03-04-page49.txt: [('-', ''), ('-', '')] LibM19081001-V03-04-page50.txt: [('-', ''), ('-', '')] LibM19081001-V03-04-page52.txt: [('-', '')] LibM19081001-V03-04-page8.txt: [('-', '')] LibM19081001-V03-04-page9.txt: [('un-', 'un')] LibM19090101-V04-01-page1.txt: [('-', ''), ('.....-', '.....'), ('-', ''), ('-....i"', '....i"'), ('k..a...--', 'k..a...-'), ('-', ''), ('-', ''), ('-', '')] LibM19090101-V04-01-page17.txt: [('-', ''), ('con-', 'con')] LibM19090101-V04-01-page18.txt: [('-', '')] LibM19090101-V04-01-page21.txt: [('-', ''), ('-', '')] LibM19090101-V04-01-page24.txt: [('-.', '.'), ('.-', '.')] LibM19090101-V04-01-page26.txt: [('con-', 'con')] LibM19090101-V04-01-page3.txt: [('relig-', 'relig')] LibM19090101-V04-01-page30.txt: [('-', '')] LibM19090101-V04-01-page33.txt: [('-', ''), ('na-', 'na'), ('insti-', 'insti'), ('r--', 'r-')] LibM19090101-V04-01-page36.txt: [('-', '')] LibM19090101-V04-01-page37.txt: [('-.', '.'), ('.-', '.')] LibM19090101-V04-01-page40.txt: [('founda-', 'founda')] LibM19090101-V04-01-page41.txt: [('per-', 'per')] LibM19090101-V04-01-page44.txt: [('in-', 'in'), ('disor-', 'disor')] LibM19090101-V04-01-page45.txt: [('be-', 'be'), ('Mc-', 'Mc')] LibM19090101-V04-01-page47.txt: [('-', ''), ('-', '')] LibM19090101-V04-01-page49.txt: [('Post-', 'Post'), ('-', ''), ('-', '')] LibM19090101-V04-01-page5.txt: [('-', '')] LibM19090101-V04-01-page50.txt: [('-', '')] LibM19090101-V04-01-page52.txt: [('-', ''), ('-', '')] LibM19090101-V04-01-page7.txt: [('SECRE-', 'SECRE')] LibM19090401-V04-02-page1.txt: [('-', ''), ('-L', 'L'), ('-', ''), ('-', ''), ('---mussuaillir', '--mussuaillir'), ('-', '')] LibM19090401-V04-02-page10.txt: [('scru-', 'scru')] LibM19090401-V04-02-page12.txt: [('-', ''), ('Gib-', 'Gib'), ('in-', 'in')] LibM19090401-V04-02-page2.txt: [('-hi', 'hi'), ('-', ''), ('-ss', 'ss'), ('e-', 'e'), ('-', ''), ('-', ''), ('ja-', 'ja'), ('ace-', 'ace'), ('-Q.s.-', 'Q.s.-'), ('Ca-', 'Ca'), ('-', ''), ('-', ''), ('.....-', '.....'), ('-', ''), ('-', ''), ('-', ''), ('-V', 'V'), ('-', ''), ('-', ''), ('-', ''), ('-dte-y', 'dte-y'), ('-', ''), ('-c.x', 'c.x'), ('-eed', 'eed'), ('-', ''), ('rt-', 'rt'), ('-', ''), ('-', ''), ('-', ''), ("'r-", "'r"), ('-n', 'n')] LibM19090401-V04-02-page20.txt: [('Vir-', 'Vir')] LibM19090401-V04-02-page23.txt: [('-', ''), ('As-', 'As')] LibM19090401-V04-02-page30.txt: [('--', '-'), ('gov-', 'gov')] LibM19090401-V04-02-page36.txt: [('-t', 't')] LibM19090401-V04-02-page38.txt: [('Rich-', 'Rich'), ('neverthe-', 'neverthe')] LibM19090401-V04-02-page45.txt: [('-', ''), ('con-', 'con'), ('-t', 't'), ('-', ''), ('-eta', 'eta'), ('.-', '.'), ('-', ''), ('aforexo.-', 'aforexo.')] LibM19090401-V04-02-page46.txt: [('-', ''), ('mur-', 'mur'), ('-', ''), ('-.', '.')] LibM19090401-V04-02-page48.txt: [('PEAR-', 'PEAR'), ('-', ''), ('-', '')] LibM19090401-V04-02-page49.txt: [('Post-', 'Post'), ('-', ''), ('-', '')] LibM19090401-V04-02-page5.txt: [('-.', '.'), ('.-', '.')] LibM19090401-V04-02-page50.txt: [('-', '')] LibM19090401-V04-02-page51.txt: [('-', ''), ('-', ''), ('-', ''), ('--', '-'), ('-', '')] LibM19090401-V04-02-page8.txt: [('Mc-', 'Mc')] LibM19090401-V04-02-page9.txt: [('-', ''), ('acknowl-', 'acknowl')] LibM19090701-V04-03-page1.txt: [('-"', '"'), ('-lib', 'lib'), ('..ILI--', '..ILI-')] LibM19090701-V04-03-page10.txt: [('RECEP-', 'RECEP')] LibM19090701-V04-03-page11.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('an-', 'an'), ('at--', 'at-')] LibM19090701-V04-03-page13.txt: [('Con-', 'Con'), ('-', '')] LibM19090701-V04-03-page14.txt: [('-', '')] LibM19090701-V04-03-page2.txt: [('-', ''), ('-', ''), ('-', '')] LibM19090701-V04-03-page22.txt: [('Russian-', 'Russian')] LibM19090701-V04-03-page23.txt: [('-', '')] LibM19090701-V04-03-page26.txt: [('-', '')] LibM19090701-V04-03-page29.txt: [('time-hon-', 'time-hon')] LibM19090701-V04-03-page3.txt: [('mat-', 'mat')] LibM19090701-V04-03-page30.txt: [('-', '')] LibM19090701-V04-03-page32.txt: [('de-', 'de')] LibM19090701-V04-03-page33.txt: [('non-', 'non'), ('pro-', 'pro')] LibM19090701-V04-03-page34.txt: [('Hu-', 'Hu'), ('CHRIS-', 'CHRIS'), ('be-', 'be')] LibM19090701-V04-03-page36.txt: [('--', '-')] LibM19090701-V04-03-page37.txt: [('there-', 'there'), ('re-', 're')] LibM19090701-V04-03-page40.txt: [('en-', 'en')] LibM19090701-V04-03-page42.txt: [('-taptimi', 'taptimi'), ('-thifii', 'thifii'), ('-', ''), ('trinn-', 'trinn'), ('-fihAt', 'fihAt'), ('Yr-', 'Yr')] LibM19090701-V04-03-page44.txt: [('en-', 'en'), ('Anti-', 'Anti')] LibM19090701-V04-03-page48.txt: [('Post-', 'Post'), ('Cook-', 'Cook')] LibM19090701-V04-03-page49.txt: [('-', ''), ('-', ''), ('APPEAR-', 'APPEAR')] LibM19090701-V04-03-page5.txt: [('govern-', 'govern')] LibM19090701-V04-03-page50.txt: [('-', ''), ('-', ''), ('-', ''), ('-', '')] LibM19090701-V04-03-page51.txt: [('-', '')] LibM19090701-V04-03-page52.txt: [('-', ''), ('-', '')] LibM19090701-V04-03-page6.txt: [('pro-', 'pro')] LibM19090701-V04-03-page7.txt: [('Mc-', 'Mc')] LibM19090701-V04-03-page9.txt: [('-', ''), ('-', '')] LibM19091001-V04-04-page10.txt: [('af-', 'af')] LibM19091001-V04-04-page11.txt: [('gov-', 'gov'), ('horse-', 'horse')] LibM19091001-V04-04-page13.txt: [('be-', 'be')] LibM19091001-V04-04-page14.txt: [('af-', 'af')] LibM19091001-V04-04-page15.txt: [('R-', 'R')] LibM19091001-V04-04-page16.txt: [('-.', '.'), ('.-', '.'), ('spiritu-', 'spiritu')] LibM19091001-V04-04-page17.txt: [('es-', 'es')] LibM19091001-V04-04-page18.txt: [('di-', 'di')] LibM19091001-V04-04-page19.txt: [('-', '')] LibM19091001-V04-04-page2.txt: [('Au-', 'Au'), ('Post-', 'Post')] LibM19091001-V04-04-page21.txt: [('-', '')] LibM19091001-V04-04-page22.txt: [('-', ''), ('-', ''), ('anti-', 'anti'), ('-', ''), ('-', ''), ('Mc-', 'Mc')] LibM19091001-V04-04-page23.txt: [('A.-', 'A.')] LibM19091001-V04-04-page25.txt: [('.T-', '.T')] LibM19091001-V04-04-page3.txt: [('-', '')] LibM19091001-V04-04-page30.txt: [('Ware-', 'Ware'), ('-the', 'the')] LibM19091001-V04-04-page31.txt: [('-', ''), ('CON-', 'CON')] LibM19091001-V04-04-page32.txt: [('-', ''), ('-', '')] LibM19091001-V04-04-page35.txt: [('de-', 'de')] LibM19091001-V04-04-page36.txt: [('-', '')] LibM19091001-V04-04-page38.txt: [('b-', 'b'), ('phrase-', 'phrase'), ('-', '')] LibM19091001-V04-04-page39.txt: [('non-', 'non')] LibM19091001-V04-04-page4.txt: [('para-', 'para')] LibM19091001-V04-04-page45.txt: [('finan-', 'finan')] LibM19091001-V04-04-page47.txt: [('APPEAR-', 'APPEAR')] LibM19091001-V04-04-page48.txt: [('-', ''), ('-', ''), ('-', '')] LibM19091001-V04-04-page7.txt: [('Mc-', 'Mc'), ('differen-', 'differen')] LibM19091001-V04-04-page8.txt: [('-', '')] LibM19091001-V04-04-page9.txt: [('-', '')] LibM19100101-V05-01-page1.txt: [('r-', 'r'), ('-.', '.'), ('.-', '.')] LibM19100101-V05-01-page11.txt: [('thou-', 'thou')] LibM19100101-V05-01-page13.txt: [('-', ''), ('Ad-', 'Ad')] LibM19100101-V05-01-page14.txt: [('WASH-', 'WASH'), ('RE-', 'RE'), ('mem-', 'mem')] LibM19100101-V05-01-page17.txt: [('Mc-', 'Mc'), ('Secretary-of-', 'Secretary-of')] LibM19100101-V05-01-page19.txt: [('incon-', 'incon')] LibM19100101-V05-01-page2.txt: [('-', ''), ('-', ''), ('-', '')] LibM19100101-V05-01-page20.txt: [('com-', 'com')] LibM19100101-V05-01-page21.txt: [('sup-', 'sup')] LibM19100101-V05-01-page23.txt: [('free-', 'free')] LibM19100101-V05-01-page24.txt: [('Chris-', 'Chris')] LibM19100101-V05-01-page27.txt: [('-', '')] LibM19100101-V05-01-page31.txt: [('-', ''), ('-', ''), ('--', '-'), ('--', '-')] LibM19100101-V05-01-page32.txt: [('guar-', 'guar'), ('Postmaster-', 'Postmaster')] LibM19100101-V05-01-page33.txt: [('-Edward', 'Edward'), ('des-', 'des')] LibM19100101-V05-01-page34.txt: [('Anti-', 'Anti')] LibM19100101-V05-01-page35.txt: [('com-', 'com')] LibM19100101-V05-01-page36.txt: [('-', ''), ('-"', '"'), ('-', '')] LibM19100101-V05-01-page37.txt: [('separa-', 'separa')] LibM19100101-V05-01-page39.txt: [('-Z', 'Z'), ('-.E', '.E'), ('-', ''), ('-A', 'A')] LibM19100101-V05-01-page42.txt: [('-', '')] LibM19100101-V05-01-page45.txt: [('-', '')] LibM19100101-V05-01-page46.txt: [('-', ''), ('over-', 'over')] LibM19100101-V05-01-page47.txt: [('-', '')] LibM19100101-V05-01-page48.txt: [('-', '')] LibM19100101-V05-01-page49.txt: [('sp-', 'sp'), ('-', ''), ('-', '')] LibM19100101-V05-01-page50.txt: [('-', ''), ('Artaa.--', 'Artaa.-')] LibM19100101-V05-01-page6.txt: [('-', '')] LibM19100101-V05-01-page7.txt: [('ap-', 'ap'), ('dis-', 'dis'), ('Cath-', 'Cath')] LibM19100401-V05-02-page1.txt: [('...m..."..--', '...m..."..-'), ('.-', '.'), ('--mommumniummunuimiumuutimutimmulummimmiummintomunmumumummumumumnomminuninumninummumumummtuntiummirt', '-mommumniummunuimiumuutimutimmulummimmiummintomunmumumummumumumnomminuninumninummumumummtuntiummirt'), ('-.', '.'), ('-', ''), ('-', ''), ("-'-", "'-"), ('-j', 'j'), ("--S-'''", "-S-'''"), ('--', '-')] LibM19100401-V05-02-page12.txt: [('enforce-', 'enforce'), ('op-', 'op')] LibM19100401-V05-02-page13.txt: [('-', ''), ('-', '')] LibM19100401-V05-02-page16.txt: [('Mary-', 'Mary')] LibM19100401-V05-02-page18.txt: [('-', '')] LibM19100401-V05-02-page2.txt: [('-', ''), ('-', ''), ('-', '')] LibM19100401-V05-02-page21.txt: [('sun-', 'sun')] LibM19100401-V05-02-page24.txt: [('at-', 'at'), ('-church', 'church')] LibM19100401-V05-02-page25.txt: [('trans-', 'trans')] LibM19100401-V05-02-page26.txt: [('in-', 'in')] LibM19100401-V05-02-page27.txt: [('ex-', 'ex')] LibM19100401-V05-02-page3.txt: [('-PR', 'PR')] LibM19100401-V05-02-page35.txt: [('-', ''), ('-friEHORRoki', 'friEHORRoki'), ('-CHER', 'CHER')] LibM19100401-V05-02-page38.txt: [('Sun-', 'Sun')] LibM19100401-V05-02-page40.txt: [('advo-', 'advo')] LibM19100401-V05-02-page46.txt: [('re-', 're')] LibM19100401-V05-02-page48.txt: [('Teach-', 'Teach')] LibM19100401-V05-02-page49.txt: [('-', '')] LibM19100401-V05-02-page5.txt: [('LIB-', 'LIB')] LibM19100401-V05-02-page50.txt: [('-', ''), ('ac-', 'ac')] LibM19100401-V05-02-page52.txt: [('-', ''), ('legisla-', 'legisla'), ('Jan-', 'Jan')] LibM19100401-V05-02-page6.txt: [('-', '')] LibM19100401-V05-02-page8.txt: [('PRESI-', 'PRESI')] LibM19100701-V05-03-page17.txt: [('pros-', 'pros')] LibM19100701-V05-03-page18.txt: [('ex-', 'ex')] LibM19100701-V05-03-page2.txt: [('-', ''), ('-', ''), ('-', ''), ('Lafayette-', 'Lafayette'), ('-', '')] LibM19100701-V05-03-page20.txt: [('-', ''), ('-', '')] LibM19100701-V05-03-page21.txt: [('cus-', 'cus'), ('-', '')] LibM19100701-V05-03-page22.txt: [('-', '')] LibM19100701-V05-03-page23.txt: [('Cath-', 'Cath')] LibM19100701-V05-03-page24.txt: [('-', '')] LibM19100701-V05-03-page26.txt: [('prin-', 'prin')] LibM19100701-V05-03-page28.txt: [('non-', 'non'), ('insti-', 'insti'), ('re-', 're')] LibM19100701-V05-03-page30.txt: [('-', ''), ('-I', 'I')] LibM19100701-V05-03-page32.txt: [('An-', 'An'), ('Gen-', 'Gen')] LibM19100701-V05-03-page34.txt: [('-', ''), ('meet-', 'meet')] LibM19100701-V05-03-page35.txt: [('Li-', 'Li'), ('circula-', 'circula'), ('Pro-', 'Pro')] LibM19100701-V05-03-page37.txt: [('ERRON-', 'ERRON'), ('HISTOR-', 'HISTOR'), ('PRAC-', 'PRAC'), ('-', '')] LibM19100701-V05-03-page40.txt: [('--', '-')] LibM19100701-V05-03-page46.txt: [('Anti-', 'Anti')] LibM19100701-V05-03-page49.txt: [('PROTES-', 'PROTES'), ('MAG-', 'MAG'), ('Roosevelt-', 'Roosevelt'), ('-', '')] LibM19100701-V05-03-page5.txt: [('Vat-', 'Vat')] LibM19100701-V05-03-page50.txt: [('-', '')] LibM19100701-V05-03-page52.txt: [('Inter-', 'Inter'), ('Post-', 'Post')] LibM19100701-V05-03-page7.txt: [('Mc-', 'Mc')] LibM19101001-V05-04-page1.txt: [('-', ''), ('-ANIMMIIMMIMMIIMMIIMIMMIWIMMUMWHIMOMMOMMIIMMIHMUMMIMIUMMIMMEMIIMMUMMUMMENUMIIIIMMUUMMINUMMIS', 'ANIMMIIMMIMMIIMMIIMIMMIWIMMUMWHIMOMMOMMIIMMIHMUMMIMIUMMIMMEMIIMMUMMUMMENUMIIIIMMUUMMINUMMIS'), ('st-', 'st'), ('-..-...', '..-...'), ('-X', 'X'), ('"-', '"'), ('-', ''), ('r.-', 'r.'), ('-', ''), ('-', ''), ('---', '--')] LibM19101001-V05-04-page10.txt: [('-under', 'under')] LibM19101001-V05-04-page11.txt: [('-authority', 'authority')] LibM19101001-V05-04-page13.txt: [('-', '')] LibM19101001-V05-04-page15.txt: [('-', '')] LibM19101001-V05-04-page16.txt: [('gov-', 'gov')] LibM19101001-V05-04-page19.txt: [('-', ''), ('spir-', 'spir')] LibM19101001-V05-04-page2.txt: [('-', ''), ('-', ''), ('-S', 'S')] LibM19101001-V05-04-page21.txt: [('-', ''), ('OPEN-', 'OPEN')] LibM19101001-V05-04-page23.txt: [('OPEN-', 'OPEN'), ('gov-', 'gov')] LibM19101001-V05-04-page24.txt: [('MON-', 'MON')] LibM19101001-V05-04-page25.txt: [('hon-', 'hon')] LibM19101001-V05-04-page26.txt: [('sig-', 'sig')] LibM19101001-V05-04-page28.txt: [('MON-', 'MON'), ('char-', 'char'), ('in-', 'in'), ('L-', 'L')] LibM19101001-V05-04-page29.txt: [('-', ''), ('interna-', 'interna')] LibM19101001-V05-04-page30.txt: [('com-', 'com')] LibM19101001-V05-04-page32.txt: [('antipedo-', 'antipedo')] LibM19101001-V05-04-page34.txt: [('fear-', 'fear'), ('-', '')] LibM19101001-V05-04-page35.txt: [('consola-', 'consola')] LibM19101001-V05-04-page36.txt: [('-', '')] LibM19101001-V05-04-page39.txt: [('y-', 'y')] LibM19101001-V05-04-page42.txt: [('Zapnath-', 'Zapnath'), ('-"', '"'), ('Tel-el-', 'Tel-el')] LibM19101001-V05-04-page43.txt: [('de-', 'de')] LibM19101001-V05-04-page49.txt: [('PROTES-', 'PROTES'), ('MAG-', 'MAG'), ('Roosevelt-', 'Roosevelt')] LibM19101001-V05-04-page5.txt: [('-.', '.'), ('-', '')] LibM19101001-V05-04-page50.txt: [('-', '')] LibM19101001-V05-04-page51.txt: [('-', '')] LibM19101001-V05-04-page8.txt: [('-', '')] LibM19101001-V05-04-page9.txt: [('-America', 'America')] LibM19110101-V06-01-page1.txt: [('-...ffiummummiummunnummumummmumumummummunamummunummuumummmunummunnummumummumnitumnims', '...ffiummummiummunnummumummmumumummummunamummunummuumummmunummunnummumummumnitumnims'), ('-"C""', '"C""'), ('-', ''), ('-', ''), ('-', ''), ('Z---', 'Z--'), ('-', ''), ('.---', '.--'), ('-.', '.'), ('-', ''), ('ir-', 'ir'), ('-', ''), ('"nrnurilillpii"-', '"nrnurilillpii"')] LibM19110101-V06-01-page11.txt: [('-as', 'as'), ('desire-', 'desire')] LibM19110101-V06-01-page12.txt: [('sum-', 'sum')] LibM19110101-V06-01-page13.txt: [('-', '')] LibM19110101-V06-01-page15.txt: [('-', '')] LibM19110101-V06-01-page18.txt: [('enforce-', 'enforce'), ('Mc-', 'Mc')] LibM19110101-V06-01-page2.txt: [('-', ''), ('-', ''), ('-', '')] LibM19110101-V06-01-page20.txt: [('ac-', 'ac')] LibM19110101-V06-01-page22.txt: [('-i', 'i'), ('-', ''), ('-', '')] LibM19110101-V06-01-page23.txt: [('-', '')] LibM19110101-V06-01-page27.txt: [('-', '')] LibM19110101-V06-01-page29.txt: [('contra-', 'contra')] LibM19110101-V06-01-page31.txt: [('par-', 'par')] LibM19110101-V06-01-page34.txt: [('RE-', 'RE'), ('-great', 'great')] LibM19110101-V06-01-page35.txt: [('lib-', 'lib')] LibM19110101-V06-01-page36.txt: [('-', '')] LibM19110101-V06-01-page42.txt: [('-', '')] LibM19110101-V06-01-page43.txt: [('-', '')] LibM19110101-V06-01-page49.txt: [('-', '')] LibM19110101-V06-01-page5.txt: [('differ-', 'differ')] LibM19110101-V06-01-page50.txt: [('-', ''), ('-', ''), ('-', '')] LibM19110101-V06-01-page6.txt: [('-', ''), ('--', '-'), ('-', ''), ('-', '')] LibM19110101-V06-01-page7.txt: [('po-', 'po')] LibM19110101-V06-01-page8.txt: [('Latin-', 'Latin')] LibM19110101-V06-01-page9.txt: [('--', '-'), ('-', '')] LibM19110401-V06-02-page1.txt: [('-', '')] LibM19110401-V06-02-page11.txt: [('ac-', 'ac')] LibM19110401-V06-02-page12.txt: [('employ-', 'employ')] LibM19110401-V06-02-page13.txt: [('oc-', 'oc'), ('legiti-', 'legiti')] LibM19110401-V06-02-page14.txt: [('meas-', 'meas')] LibM19110401-V06-02-page16.txt: [('nec-', 'nec')] LibM19110401-V06-02-page18.txt: [('UNI-', 'UNI'), ('labor-', 'labor')] LibM19110401-V06-02-page2.txt: [('-', ''), ('-', ''), ('-', '')] LibM19110401-V06-02-page20.txt: [('en-', 'en')] LibM19110401-V06-02-page26.txt: [('varia-', 'varia'), ('-', '')] LibM19110401-V06-02-page27.txt: [('offi-', 'offi')] LibM19110401-V06-02-page3.txt: [('-', ''), ('-', ''), ('.-', '.')] LibM19110401-V06-02-page32.txt: [('Bap-', 'Bap'), ('Relig-', 'Relig')] LibM19110401-V06-02-page33.txt: [('-wow-', 'wow-')] LibM19110401-V06-02-page34.txt: [('-', ''), ('per-', 'per')] LibM19110401-V06-02-page40.txt: [('es-', 'es')] LibM19110401-V06-02-page42.txt: [('-', ''), ('-', ''), ('-.', '.'), ('.-', '.')] LibM19110401-V06-02-page43.txt: [('-', ''), ('-', ''), ('God.-', 'God.')] LibM19110401-V06-02-page46.txt: [('Conti-', 'Conti'), ('BUILD-', 'BUILD'), ('per-', 'per')] LibM19110401-V06-02-page47.txt: [('Globe-', 'Globe')] LibM19110401-V06-02-page48.txt: [('-', ''), ('--', '-')] LibM19110401-V06-02-page49.txt: [('-', ''), ('-', ''), ('-Lamer.', 'Lamer.')] LibM19110401-V06-02-page50.txt: [('-', ''), ('-', ''), ('-', '')] LibM19110401-V06-02-page52.txt: [('-o', 'o')] LibM19110701-V06-03-page1.txt: [('--', '-'), ('-', ''), ('-dkialligranli', 'dkialligranli')] LibM19110701-V06-03-page10.txt: [('-', '')] LibM19110701-V06-03-page12.txt: [('-', ''), ('-IT.', 'IT.'), ('right-', 'right')] LibM19110701-V06-03-page14.txt: [('un-', 'un')] LibM19110701-V06-03-page15.txt: [('-for', 'for')] LibM19110701-V06-03-page16.txt: [('establish-', 'establish')] LibM19110701-V06-03-page2.txt: [('-', ''), ('-', '')] LibM19110701-V06-03-page21.txt: [('Eng-', 'Eng')] LibM19110701-V06-03-page22.txt: [('peo-', 'peo')] LibM19110701-V06-03-page24.txt: [('-', ''), ('manufac-', 'manufac')] LibM19110701-V06-03-page25.txt: [('ter-', 'ter'), ('wor-', 'wor'), ('-', '')] LibM19110701-V06-03-page26.txt: [('.-', '.'), ('-', ''), ('.-', '.')] LibM19110701-V06-03-page27.txt: [('re-', 're')] LibM19110701-V06-03-page28.txt: [('audience-', 'audience')] LibM19110701-V06-03-page31.txt: [('ac-', 'ac')] LibM19110701-V06-03-page32.txt: [('Prot-', 'Prot'), ('re-', 're')] LibM19110701-V06-03-page33.txt: [('Sabbathkeep-', 'Sabbathkeep'), ('under-', 'under')] LibM19110701-V06-03-page35.txt: [('mem-', 'mem')] LibM19110701-V06-03-page37.txt: [('dissolu-', 'dissolu')] LibM19110701-V06-03-page39.txt: [('-', '')] LibM19110701-V06-03-page4.txt: [('-', '')] LibM19110701-V06-03-page41.txt: [('bar-', 'bar')] LibM19110701-V06-03-page42.txt: [('Ma-', 'Ma')] LibM19110701-V06-03-page45.txt: [('re-', 're')] LibM19110701-V06-03-page48.txt: [('-N', 'N')] LibM19110701-V06-03-page49.txt: [('-', ''), ('-', ''), ('treat-', 'treat')] LibM19110701-V06-03-page5.txt: [('re-', 're')] LibM19110701-V06-03-page50.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('expe-', 'expe')] LibM19110701-V06-03-page52.txt: [('rea-', 'rea')] LibM19110701-V06-03-page9.txt: [('Post-', 'Post'), ('ob-', 'ob')] LibM19111001-V06-04-page1.txt: [('-', ''), ('-', '')] LibM19111001-V06-04-page11.txt: [('Latin-', 'Latin'), ('An-', 'An'), ('Con-', 'Con')] LibM19111001-V06-04-page12.txt: [('-', ''), ('guar-', 'guar')] LibM19111001-V06-04-page14.txt: [('Etats-', 'Etats'), ('-', ''), ('-', '')] LibM19111001-V06-04-page16.txt: [('----', '---'), ('AMER-', 'AMER')] LibM19111001-V06-04-page17.txt: [('rev-', 'rev')] LibM19111001-V06-04-page18.txt: [('o-', 'o'), ('ex-', 'ex')] LibM19111001-V06-04-page19.txt: [('-said', 'said'), ('legis-', 'legis')] LibM19111001-V06-04-page2.txt: [('-', ''), ('-', '')] LibM19111001-V06-04-page23.txt: [('ex-', 'ex')] LibM19111001-V06-04-page24.txt: [('-', ''), ('-.', '.'), ('.-', '.')] LibM19111001-V06-04-page26.txt: [('-', ''), ('con-', 'con')] LibM19111001-V06-04-page34.txt: [('-', ''), ('-', '')] LibM19111001-V06-04-page35.txt: [('argu-', 'argu')] LibM19111001-V06-04-page36.txt: [('CRUM-', 'CRUM')] LibM19111001-V06-04-page38.txt: [('-', ''), ('-', '')] LibM19111001-V06-04-page39.txt: [('-', ''), ('-', '')] LibM19111001-V06-04-page40.txt: [('-', '')] LibM19111001-V06-04-page42.txt: [('-', '')] LibM19111001-V06-04-page48.txt: [('-', '')] LibM19111001-V06-04-page49.txt: [('-', ''), ('-', ''), ('PDNIam-', 'PDNIam')] LibM19111001-V06-04-page5.txt: [('o-', 'o'), ('-', '')] LibM19111001-V06-04-page50.txt: [('-', ''), ('-', ''), ('-', '')] LibM19111001-V06-04-page52.txt: [('-li', 'li'), ('Ra-', 'Ra'), ('-li', 'li')] LibM19111001-V06-04-page8.txt: [('-', '')] LibM19120101-V07-01-page12.txt: [('-', ''), ('-', '')] LibM19120101-V07-01-page15.txt: [('assess-', 'assess'), ('com-', 'com')] LibM19120101-V07-01-page19.txt: [('-other', 'other')] LibM19120101-V07-01-page2.txt: [('-', ''), ('-', '')] LibM19120101-V07-01-page22.txt: [('com-', 'com')] LibM19120101-V07-01-page26.txt: [('Novem-', 'Novem')] LibM19120101-V07-01-page27.txt: [('Pan-', 'Pan')] LibM19120101-V07-01-page33.txt: [('-', '')] LibM19120101-V07-01-page37.txt: [('-', '')] LibM19120101-V07-01-page38.txt: [('Brigadier-', 'Brigadier'), ('fin-', 'fin')] LibM19120101-V07-01-page39.txt: [('-', '')] LibM19120101-V07-01-page42.txt: [('-', ''), ('-', ''), ('-', '')] LibM19120101-V07-01-page43.txt: [('-', ''), ('ESTAB-', 'ESTAB')] LibM19120101-V07-01-page45.txt: [('-', ''), ('-', '')] LibM19120101-V07-01-page46.txt: [('-', '')] LibM19120101-V07-01-page47.txt: [('-.', '.'), ('.-', '.')] LibM19120101-V07-01-page49.txt: [('devel-', 'devel'), ('-', ''), ('PM-', 'PM'), ('-', ''), ('p-', 'p')] LibM19120101-V07-01-page50.txt: [('-', ''), ('-', ''), ('-', ''), ('-', '')] LibM19120101-V07-01-page6.txt: [('-', '')] LibM19120101-V07-01-page7.txt: [('-', ''), ('Fairbanks-Roosevelt-', 'Fairbanks-Roosevelt')] LibM19120101-V07-01-page8.txt: [('-', ''), ('-', '')] LibM19120101-V07-01-page9.txt: [('be-', 'be')] LibM19120401-V07-02-page2.txt: [('-', '')] LibM19120401-V07-02-page21.txt: [('--', '-')] LibM19120401-V07-02-page25.txt: [('no-', 'no')] LibM19120401-V07-02-page26.txt: [('divi-', 'divi'), ('mat-', 'mat')] LibM19120401-V07-02-page27.txt: [('di-', 'di')] LibM19120401-V07-02-page29.txt: [('--', '-')] LibM19120401-V07-02-page30.txt: [('-', '')] LibM19120401-V07-02-page31.txt: [('un-', 'un')] LibM19120401-V07-02-page33.txt: [('un-', 'un'), ('be-', 'be')] LibM19120401-V07-02-page34.txt: [('un-', 'un')] LibM19120401-V07-02-page36.txt: [('Accord-', 'Accord')] LibM19120401-V07-02-page37.txt: [('-', '')] LibM19120401-V07-02-page38.txt: [('-', ''), ('-', ''), ('.-', '.'), ('-', ''), ('-', ''), ('-', ''), ('-icx-m', 'icx-m'), ('Xl-td-', 'Xl-td'), ('ec-', 'ec'), ('-', ''), ('-', ''), ('-', ''), ('-mensisZ."\'"-', 'mensisZ."\'"-')] LibM19120401-V07-02-page4.txt: [('-...', '...')] LibM19120401-V07-02-page40.txt: [('-Sep-', 'Sep-'), ('-szera', 'szera'), ('-ilre', 'ilre')] LibM19120401-V07-02-page42.txt: [('discus-', 'discus'), ('-sion', 'sion')] LibM19120401-V07-02-page48.txt: [('-', ''), ('LIB-', 'LIB'), ('-', '')] LibM19120401-V07-02-page49.txt: [('-', '')] LibM19120401-V07-02-page51.txt: [('-Seven', 'Seven'), ('ar-', 'ar')] LibM19120401-V07-02-page6.txt: [('Cali-', 'Cali')] LibM19120401-V07-02-page7.txt: [('non-', 'non'), ('-', '')] LibM19120401-V07-02-page8.txt: [('for-', 'for')] LibM19120701-V07-03-page11.txt: [('be-', 'be')] LibM19120701-V07-03-page13.txt: [('anti-', 'anti'), ('-rotest', 'rotest'), ('hol-', 'hol')] LibM19120701-V07-03-page14.txt: [('-', ''), ('-', '')] LibM19120701-V07-03-page16.txt: [('Sec-', 'Sec')] LibM19120701-V07-03-page17.txt: [('-', ''), ('com-', 'com')] LibM19120701-V07-03-page18.txt: [('distinct-', 'distinct')] LibM19120701-V07-03-page2.txt: [('Co-', 'Co'), ('-', '')] LibM19120701-V07-03-page20.txt: [('-ss', 'ss')] LibM19120701-V07-03-page21.txt: [('-', ''), ('-', ''), ('estab-', 'estab')] LibM19120701-V07-03-page22.txt: [('re-', 're'), ('--', '-')] LibM19120701-V07-03-page25.txt: [('-', '')] LibM19120701-V07-03-page28.txt: [('unveil-', 'unveil')] LibM19120701-V07-03-page30.txt: [('-', '')] LibM19120701-V07-03-page37.txt: [('-', '')] LibM19120701-V07-03-page38.txt: [('con-', 'con'), ('AMEND-', 'AMEND'), ('-', '')] LibM19120701-V07-03-page4.txt: [('-ititeiltintonecfctration', 'ititeiltintonecfctration'), ('-', ''), ('-', ''), ('-', ''), ('e.n.d....-', 'e.n.d....'), ('i-', 'i'), ('-', ''), ('--', '-'), ('-', ''), ('-', ''), ('s-', 's'), ('-', ''), ('-fr', 'fr'), ('ee-', 'ee'), ('-', ''), ('-', ''), ('-..-.', '..-.'), ('f--', 'f-'), ('otb-', 'otb'), ('......-', '......'), ('--', '-'), ('-', ''), ('-a', 'a'), ('-.', '.'), ('-e-', 'e-'), ('-', ''), ('.-', '.'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-.', '.'), ('-', ''), ('d-', 'd'), ('..---', '..--'), ('.i..-', '.i..'), ('..ta.--', '..ta.-'), ('.-', '.'), ('...-', '...'), ('-w', 'w'), ('x.t-', 'x.t'), ('-', ''), ('-', ''), ('--', '-'), ('-', ''), ('-', ''), ('..-', '..'), ('-', ''), ('.-', '.'), ('..g-Z-', '..g-Z'), ('---.', '--.'), ('--', '-'), ('-', ''), ("---'", "--'"), ('--ft', '-ft'), ('----a', '---a'), ('-', ''), ('-', ''), ('-', ''), ('-.', '.'), ('-sfo', 'sfo'), ('-', ''), ('....-', '....'), ('-', ''), ('a...-', 'a...'), ('-', ''), ('-.', '.'), ('-', ''), ('--', '-'), ("-.i'", ".i'"), ('N.-', 'N.'), ('m-', 'm'), ('-', ''), ('-', ''), ('-', ''), ('dfr.d.-', 'dfr.d.'), ('-e', 'e'), ('ap-', 'ap'), ('-.onia', '.onia'), ('-', ''), ('-', ''), ('-.', '.'), ('.-', '.'), ('--z', '-z'), ('-', ''), ('-', ''), ('-', ''), ('...-vr-', '...-vr'), ('-.', '.'), ('.-', '.'), ('-', ''), ('e-', 'e'), ('-', ''), ('-e', 'e'), ('-..', '..'), ('-A.c.....', 'A.c.....'), ('-', ''), ('-', ''), ('--', '-'), ('-', ''), ('-..g.', '..g.'), ('-.', '.'), ('g--', 'g-'), ('-', ''), ("--'", "-'"), ('-inio', 'inio'), ('-LI', 'LI'), ('-I', 'I'), ('-...', '...'), ('N.-', 'N.'), ('n-', 'n'), ('.ea...-', '.ea...'), ('-a', 'a'), ('-', ''), ('-', ''), ('-i-', 'i-'), ('-..a.A.', '..a.A.'), ('h---', 'h--'), ('.-', '.'), ('-', ''), ('--.r..', '-.r..'), ('.-', '.'), ('-.-', '.-'), ('-', ''), ('---', '--'), ("-'", "'"), ('---is.', '--is.'), ('-', ''), ('-r', 'r'), ('--Yelor.', '-Yelor.'), ('-.', '.'), ('-....-..C.', '....-..C.'), ('-', ''), ('-ir."...ezi..i..', 'ir."...ezi..i..'), ('-', ''), ('-', ''), ('e.e.-', 'e.e.'), ('-', ''), ('-', ''), ('..-', '..'), ('-', ''), ('-"', '"'), ('-', ''), ('-.', '.'), ('-', ''), ('....-', '....'), ('-', ''), ('-', ''), ('-', ''), ('"....-', '"....'), ('............nen-', '............nen'), ('--..z..', '-..z..'), ('I-', 'I'), ('-', ''), ('-', ''), ('-', ''), ('-.', '.'), ('-', ''), ('.-', '.'), ('--', '-'), ('-', ''), ('--Ve', '-Ve'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-a-', 'a-'), ('-', ''), ('.-', '.'), ('-a.Cdr', 'a.Cdr'), ('-', ''), ('eartc-', 'eartc'), ('--', '-'), ('-.-', '.-'), ('..-', '..'), ('-..-.....', '..-.....'), ('-', ''), ('....-', '....'), ('-', ''), ('.--', '.-'), ('--.....', '-.....'), ('-', ''), ('.-', '.'), ("-'..", "'.."), ('-', ''), ('-', ''), ('-', ''), ('-riK-', 'riK-'), ('-', ''), ('-.', '.'), ('--', '-'), ('---', '--'), ('--', '-'), ('-', ''), ('-', ''), ('-.', '.'), ('--r', '-r'), ('.--', '.-'), ('-', ''), ('-...-', '...-')] LibM19120701-V07-03-page42.txt: [('-', '')] LibM19120701-V07-03-page43.txt: [('fun-', 'fun'), ('ap-', 'ap')] LibM19120701-V07-03-page46.txt: [('-', ''), ('-', ''), ('.-', '.')] LibM19120701-V07-03-page47.txt: [('-', '')] LibM19120701-V07-03-page48.txt: [('-', ''), ('-', '')] LibM19120701-V07-03-page49.txt: [('-', ''), ('gentle-', 'gentle')] LibM19120701-V07-03-page5.txt: [('-', '')] LibM19120701-V07-03-page51.txt: [('Gov-', 'Gov'), ('dis-', 'dis')] LibM19120701-V07-03-page52.txt: [('-.', '.'), ('re-', 're'), ('-', ''), ('We-', 'We'), ('-', '')] LibM19120701-V07-03-page9.txt: [('Pan-', 'Pan'), ('November-', 'November'), ('observ-', 'observ')] LibM19121001-V07-04-page13.txt: [('-', '')] LibM19121001-V07-04-page14.txt: [('non-', 'non')] LibM19121001-V07-04-page15.txt: [('-', ''), ('Postmaster-', 'Postmaster')] LibM19121001-V07-04-page17.txt: [('-', '')] LibM19121001-V07-04-page19.txt: [('-', '')] LibM19121001-V07-04-page2.txt: [('.-', '.'), ('-', ''), ('-', ''), ('Steph-', 'Steph')] LibM19121001-V07-04-page20.txt: [('mat-', 'mat')] LibM19121001-V07-04-page21.txt: [('Cath-', 'Cath')] LibM19121001-V07-04-page23.txt: [('-the', 'the')] LibM19121001-V07-04-page29.txt: [('deter-', 'deter'), ('-', ''), ('constru-', 'constru'), ('spe-', 'spe')] LibM19121001-V07-04-page3.txt: [('-', '')] LibM19121001-V07-04-page31.txt: [('-', '')] LibM19121001-V07-04-page32.txt: [('-', '')] LibM19121001-V07-04-page41.txt: [('seek-', 'seek')] LibM19121001-V07-04-page44.txt: [('relation-', 'relation')] LibM19121001-V07-04-page5.txt: [('ad-', 'ad')] LibM19121001-V07-04-page50.txt: [('-..', '..')] LibM19121001-V07-04-page51.txt: [('-', ''), ('-', ''), ('Answers-', 'Answers')] LibM19121001-V07-04-page6.txt: [('Orion-', 'Orion'), ('.-', '.'), ('.raityr-', '.raityr'), ('neer.-', 'neer.'), ('V-', 'V'), ('mow-', 'mow')] LibM19121001-V07-04-page7.txt: [('.-', '.'), ('-', ''), ('-', ''), ('-', '')] LibM19121001-V07-04-page9.txt: [('-', ''), ('ma-', 'ma')] LibM19130101-V08-01-page10.txt: [('think-', 'think')] LibM19130101-V08-01-page11.txt: [('considera-', 'considera')] LibM19130101-V08-01-page14.txt: [('vigor-', 'vigor')] LibM19130101-V08-01-page15.txt: [('re-', 're')] LibM19130101-V08-01-page17.txt: [('---', '--')] LibM19130101-V08-01-page2.txt: [('Co-', 'Co'), ('lhan-', 'lhan'), ('-', ''), ('MitaM.O.D.mroo.M.O.m.-', 'MitaM.O.D.mroo.M.O.m.')] LibM19130101-V08-01-page22.txt: [('pro-', 'pro')] LibM19130101-V08-01-page24.txt: [('-', ''), ('LIB-', 'LIB')] LibM19130101-V08-01-page25.txt: [('in-', 'in')] LibM19130101-V08-01-page26.txt: [('inves-', 'inves')] LibM19130101-V08-01-page27.txt: [('det-', 'det')] LibM19130101-V08-01-page3.txt: [('-', ''), ('pre-', 'pre'), ('there-', 'there'), ('un-', 'un'), ('-', ''), ('SECOND.-', 'SECOND.'), ('pur-', 'pur'), ('FIRST.-', 'FIRST.'), ('ad-', 'ad')] LibM19130101-V08-01-page31.txt: [('Atlas-', 'Atlas'), ('individ-', 'individ')] LibM19130101-V08-01-page32.txt: [('-', ''), ('-', '')] LibM19130101-V08-01-page35.txt: [('-', '')] LibM19130101-V08-01-page37.txt: [('-', '')] LibM19130101-V08-01-page4.txt: [('maga-', 'maga')] LibM19130101-V08-01-page40.txt: [('-', ''), ('state-', 'state')] LibM19130101-V08-01-page42.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('freight-', 'freight'), ('ordi-', 'ordi'), ('-went', 'went'), ('-', ''), ('-', '')] LibM19130101-V08-01-page43.txt: [('-', ''), ('-', ''), ('-bridges', 'bridges'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('x-', 'x'), ('-', ''), ('-', ''), ('-', ''), ('-Io', 'Io'), ('-', ''), ('-', '')] LibM19130101-V08-01-page44.txt: [('work-', 'work'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('serv-', 'serv')] LibM19130101-V08-01-page45.txt: [('-', ''), ('xo-', 'xo'), ('i-', 'i'), ('-', ''), ('-', ''), ('x-', 'x'), ('s-', 's'), ('-', ''), ('-', ''), ('-', ''), ('-', '')] LibM19130101-V08-01-page46.txt: [('-', '')] LibM19130101-V08-01-page5.txt: [('ad-', 'ad'), ('-', '')] LibM19130101-V08-01-page50.txt: [('An-', 'An'), ('Ar-', 'Ar'), ('-AMERICAN', 'AMERICAN'), ('Re-', 'Re'), ('So-', 'So'), ('-', ''), ('POST-', 'POST')] LibM19130101-V08-01-page51.txt: [('--', '-'), ('-', ''), ('-', ''), ('-', ''), ('r---------', 'r--------'), ('-Nr', 'Nr'), ('-', ''), ('-', ''), ('c-', 'c')] LibM19130101-V08-01-page52.txt: [('-page', 'page')] LibM19130101-V08-01-page9.txt: [('Na-', 'Na')] LibM19130401-V08-02-page12.txt: [('min-', 'min')] LibM19130401-V08-02-page14.txt: [('Co-', 'Co')] LibM19130401-V08-02-page2.txt: [('-earoominmerk', 'earoominmerk'), ('al-', 'al'), ('-', ''), ('affil-', 'affil')] LibM19130401-V08-02-page21.txt: [('-', '')] LibM19130401-V08-02-page22.txt: [('-is', 'is')] LibM19130401-V08-02-page24.txt: [('-', ''), ('-', ''), ('pro-', 'pro')] LibM19130401-V08-02-page25.txt: [('-', '')] LibM19130401-V08-02-page27.txt: [('rea-', 'rea')] LibM19130401-V08-02-page28.txt: [('-', '')] LibM19130401-V08-02-page3.txt: [('ad-', 'ad'), ('pur-', 'pur'), ('CITI-', 'CITI'), ('PRE-', 'PRE'), ('Strug-', 'Strug'), ('CHOOS-', 'CHOOS'), ('enjoy-', 'enjoy'), ('PRIN-', 'PRIN'), ('sub-', 'sub'), ('whole-', 'whole')] LibM19130401-V08-02-page30.txt: [('.ex-', '.ex'), ('-', ''), ('Philadel-', 'Philadel'), ('reso-', 'reso'), ('Scot-', 'Scot'), ('visit-', 'visit'), ('set-', 'set'), ('his-', 'his'), ('re-', 're'), ('hu-', 'hu'), ('con-', 'con')] LibM19130401-V08-02-page34.txt: [('hav-', 'hav'), ('cer-', 'cer'), ('un-', 'un'), ('ac-', 'ac'), ('ad-', 'ad'), ('maintain-', 'maintain')] LibM19130401-V08-02-page39.txt: [('mil-', 'mil')] LibM19130401-V08-02-page4.txt: [('recom-', 'recom')] LibM19130401-V08-02-page42.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-shops.', 'shops.')] LibM19130401-V08-02-page43.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('subse-', 'subse'), ('o-', 'o'), ('sub-', 'sub'), ('loo-', 'loo'), ('-', ''), ('-', ''), ('-', ''), ('-', '')] LibM19130401-V08-02-page44.txt: [('-', ''), ('-', ''), ('-', ''), ('-r', 'r'), ('Sat-', 'Sat'), ('-', ''), ('-a', 'a'), ('-a', 'a')] LibM19130401-V08-02-page45.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] LibM19130401-V08-02-page46.txt: [('-', ''), ('o-', 'o'), ('begin-', 'begin')] LibM19130401-V08-02-page47.txt: [('-', '')] LibM19130401-V08-02-page49.txt: [('Albu-', 'Albu')] LibM19130401-V08-02-page5.txt: [('ad-', 'ad')] LibM19130401-V08-02-page50.txt: [('Ar-', 'Ar'), ('An-', 'An'), ('Re-', 'Re'), ('-', ''), ('POST-', 'POST')] LibM19130401-V08-02-page51.txt: [('-', ''), ('----', '---'), ('--', '-'), ('-----', '----'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-.-', '.-'), ('-.-.', '.-.'), ('.-', '.'), ('.-', '.'), ('---"', '--"'), ('--.', '-.'), ('-Mt', 'Mt'), ('-', ''), ('JUSTI-', 'JUSTI')] LibM19130401-V08-02-page52.txt: [('-page', 'page')] LibM19130701-V08-03-page10.txt: [('Mc-', 'Mc'), ('Re-', 'Re')] LibM19130701-V08-03-page14.txt: [('-T.', 'T.')] LibM19130701-V08-03-page17.txt: [('al-', 'al')] LibM19130701-V08-03-page18.txt: [('exer-', 'exer')] LibM19130701-V08-03-page2.txt: [('Seen-p.deffeatv-', 'Seen-p.deffeatv'), ('-eury.', 'eury.'), ('-eiteile', 'eiteile'), ('rhah-', 'rhah'), ('-', ''), ('-eeedie', 'eeedie'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-Yezaedi', 'Yezaedi'), ('-eiraeznactmew', 'eiraeznactmew'), ('-erga-evia', 'erga-evia'), ('-W', 'W'), ('-e', 'e'), ('--elt', '-elt'), ('-e', 'e'), ('MgetOofm-', 'MgetOofm'), ('SaFVtel-', 'SaFVtel'), ('-ix', 'ix')] LibM19130701-V08-03-page21.txt: [('sena-', 'sena')] LibM19130701-V08-03-page22.txt: [('corn-', 'corn')] LibM19130701-V08-03-page26.txt: [('-', ''), ('-', ''), ("-'", "'"), ('-', ''), ('-', '')] LibM19130701-V08-03-page27.txt: [('-.', '.'), ('...-', '...'), ('-', ''), ('.......--', '.......-'), ('-', ''), ('-....', '....'), ('-"..r...', '"..r...'), ('-', ''), ('-', ''), ('-', ''), ('-.-', '.-'), ('-.', '.'), ('-.', '.'), ('-', ''), ('-...', '...'), ('----.--', '---.--'), ('-.........', '.........'), ('-........"', '........"'), ('-', ''), ('-', ''), ('-', '')] LibM19130701-V08-03-page29.txt: [('Sun-', 'Sun'), ('restric-', 'restric'), ('re-', 're')] LibM19130701-V08-03-page3.txt: [('--HE', '-HE'), ('CITIZEN-', 'CITIZEN'), ('CHOOS-', 'CHOOS'), ('enjoy-', 'enjoy'), ('PRIN-', 'PRIN')] LibM19130701-V08-03-page30.txt: [('exer-', 'exer')] LibM19130701-V08-03-page32.txt: [('in-', 'in')] LibM19130701-V08-03-page33.txt: [('pub-', 'pub')] LibM19130701-V08-03-page36.txt: [('con-', 'con')] LibM19130701-V08-03-page39.txt: [('hear-', 'hear'), ('Commis-', 'Commis'), ('Sun-', 'Sun'), ('move-', 'move'), ('Chris-', 'Chris')] LibM19130701-V08-03-page4.txt: [('-', '')] LibM19130701-V08-03-page41.txt: [('re-', 're')] LibM19130701-V08-03-page42.txt: [('GOV-', 'GOV')] LibM19130701-V08-03-page44.txt: [('-t', 't'), ('-', ''), ('cd-n-', 'cd-n'), ('-ca.z', 'ca.z'), ('zW-', 'zW'), ('A-', 'A')] LibM19130701-V08-03-page49.txt: [('-', ''), ('-', ''), ('-ICIT', 'ICIT'), ('ADVER-', 'ADVER'), ('-', ''), ('-', ''), ('-eX', 'eX')] LibM19130701-V08-03-page5.txt: [('-', ''), ('ad-', 'ad')] LibM19130701-V08-03-page50.txt: [('An-', 'An')] LibM19130701-V08-03-page51.txt: [('-', ''), ('-', ''), ('-', ''), ('-', '')] LibM19130701-V08-03-page6.txt: [('-', ''), ('--', '-')] LibM19130701-V08-03-page8.txt: [('command-', 'command')] LibM19130701-V08-03-page9.txt: [('Mc-', 'Mc'), ('Mc-', 'Mc')] LibM19131001-V08-04-page10.txt: [('un-', 'un')] LibM19131001-V08-04-page11.txt: [('state-estab-', 'state-estab')] LibM19131001-V08-04-page12.txt: [('-', '')] LibM19131001-V08-04-page13.txt: [('Sun-', 'Sun'), ('with-', 'with'), ('extrav-', 'extrav'), ('preseri-', 'preseri')] LibM19131001-V08-04-page14.txt: [('Babylo-', 'Babylo')] LibM19131001-V08-04-page18.txt: [('--', '-')] LibM19131001-V08-04-page2.txt: [('-', '')] LibM19131001-V08-04-page20.txt: [('-', '')] LibM19131001-V08-04-page22.txt: [('mo-', 'mo')] LibM19131001-V08-04-page25.txt: [('ex-', 'ex'), ('ex-', 'ex'), ('Sun-', 'Sun'), ('ex-', 'ex')] LibM19131001-V08-04-page26.txt: [('pre-', 'pre')] LibM19131001-V08-04-page28.txt: [('-uncontrolled', 'uncontrolled')] LibM19131001-V08-04-page29.txt: [('in-', 'in')] LibM19131001-V08-04-page3.txt: [('CHOOS-', 'CHOOS'), ('PRIN-', 'PRIN'), ('enjoy-', 'enjoy'), ('intol-', 'intol'), ('sub-', 'sub'), ('whole-', 'whole'), ('.-', '.'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('ad-', 'ad'), ("'q-", "'q")] LibM19131001-V08-04-page30.txt: [('com-', 'com')] LibM19131001-V08-04-page31.txt: [('Re-', 'Re')] LibM19131001-V08-04-page33.txt: [('ab-', 'ab')] LibM19131001-V08-04-page34.txt: [('uni-', 'uni'), ('prog-', 'prog')] LibM19131001-V08-04-page36.txt: [('-', ''), ('-', ''), ('-', ''), ('relig-', 'relig')] LibM19131001-V08-04-page39.txt: [('--', '-')] LibM19131001-V08-04-page4.txt: [('-', ''), ('-', ''), ('-o', 'o')] LibM19131001-V08-04-page41.txt: [('-', '')] LibM19131001-V08-04-page43.txt: [('-questions', 'questions')] LibM19131001-V08-04-page44.txt: [('govern-', 'govern')] LibM19131001-V08-04-page45.txt: [('D-', 'D'), ('-', '')] LibM19131001-V08-04-page46.txt: [('-is', 'is')] LibM19131001-V08-04-page49.txt: [('ADVER-', 'ADVER')] LibM19131001-V08-04-page5.txt: [('ad-', 'ad')] LibM19131001-V08-04-page50.txt: [('-', ''), ('-', ''), ('Ar-', 'Ar')] LibM19131001-V08-04-page51.txt: [('mission-', 'mission')] LibM19131001-V08-04-page52.txt: [('--', '-'), ('-', ''), ("'.-", "'."), ('-', ''), ('-', ''), ('-.IA', '.IA')] LibM19131001-V08-04-page7.txt: [('-MMI.', 'MMI.'), ('M.-', 'M.'), ('-MED.', 'MED.'), ('-', ''), ('-rthe', 'rthe')] LibM19140101-V09-01-page1.txt: [('-', '')] LibM19140101-V09-01-page11.txt: [('-MWOO', 'MWOO'), ('-', '')] LibM19140101-V09-01-page18.txt: [('-I', 'I'), ('-', ''), ('-from', 'from'), ('prin-', 'prin')] LibM19140101-V09-01-page19.txt: [('cler-', 'cler'), ('-that', 'that')] LibM19140101-V09-01-page2.txt: [('-mm.', 'mm.'), ('-', '')] LibM19140101-V09-01-page21.txt: [('-all', 'all')] LibM19140101-V09-01-page23.txt: [('-entered', 'entered'), ('heaven-', 'heaven'), ('-', ''), ('govern-', 'govern'), ('syn-', 'syn'), ('be-', 'be'), ('-result', 'result'), ('with-', 'with')] LibM19140101-V09-01-page25.txt: [('-rights', 'rights')] LibM19140101-V09-01-page26.txt: [('-our', 'our'), ('con-', 'con')] LibM19140101-V09-01-page27.txt: [('-for', 'for')] LibM19140101-V09-01-page29.txt: [('-', ''), ('-Sabbath', 'Sabbath'), ('un-', 'un')] LibM19140101-V09-01-page3.txt: [('LIBER-', 'LIBER'), ('-inch', 'inch'), ('CHANG-', 'CHANG'), ('CARE-', 'CARE')] LibM19140101-V09-01-page30.txt: [('king--', 'king-'), ('-', '')] LibM19140101-V09-01-page31.txt: [('-', ''), ('-', ''), ('--', '-'), ('-', ''), ('-', ''), ('-', '')] LibM19140101-V09-01-page33.txt: [('-are', 'are')] LibM19140101-V09-01-page36.txt: [('-', '')] LibM19140101-V09-01-page38.txt: [('Chris-', 'Chris'), ('com-', 'com'), ('Re-', 'Re'), ('-', ''), ('-', '')] LibM19140101-V09-01-page43.txt: [('prob-', 'prob')] LibM19140101-V09-01-page44.txt: [('BUILD-', 'BUILD')] LibM19140101-V09-01-page46.txt: [('say-', 'say'), ('an--', 'an-')] LibM19140101-V09-01-page47.txt: [('-', '')] LibM19140101-V09-01-page48.txt: [('citi-', 'citi')] LibM19140101-V09-01-page52.txt: [('-.', '.')] LibM19140101-V09-01-page53.txt: [('-', ''), ('e.A-', 'e.A'), ('Ar-', 'Ar'), ('An-', 'An')] LibM19140101-V09-01-page54.txt: [('-i', 'i'), ('Albu-', 'Albu'), ('Aven-', 'Aven')] LibM19140101-V09-01-page55.txt: [('-', ''), ('-.', '.')] LibM19140101-V09-01-page56.txt: [('-VoPr', 'VoPr'), ('-', ''), ('-NA', 'NA'), ('-.N', '.N')] LibM19140101-V09-01-page8.txt: [('-', '')] LibM19140101-V09-01-page9.txt: [('ad-', 'ad')] LibM19140401-V09-02-page1.txt: [('--gm', '-gm')] LibM19140401-V09-02-page11.txt: [('-', ''), ('be-', 'be')] LibM19140401-V09-02-page12.txt: [('al-', 'al'), ('combina-', 'combina'), ('coun-', 'coun'), ('un-', 'un')] LibM19140401-V09-02-page13.txt: [('-object', 'object'), ('.-', '.')] LibM19140401-V09-02-page14.txt: [('-intolerant', 'intolerant'), ('prod-', 'prod')] LibM19140401-V09-02-page15.txt: [('Sun-', 'Sun')] LibM19140401-V09-02-page16.txt: [('-', '')] LibM19140401-V09-02-page17.txt: [('-', ''), ('ob-', 'ob')] LibM19140401-V09-02-page18.txt: [('ASSEM-', 'ASSEM')] LibM19140401-V09-02-page2.txt: [('-', ''), ('-', ''), ('-', ''), ('af-', 'af'), ('-', '')] LibM19140401-V09-02-page22.txt: [('Con-', 'Con')] LibM19140401-V09-02-page25.txt: [('-', ''), ('-', ''), ('citi-', 'citi'), ('stat-', 'stat')] LibM19140401-V09-02-page26.txt: [('prop-', 'prop')] LibM19140401-V09-02-page27.txt: [('funda-', 'funda'), ('-', '')] LibM19140401-V09-02-page29.txt: [('-', '')] LibM19140401-V09-02-page3.txt: [('CIRCULAT-', 'CIRCULAT')] LibM19140401-V09-02-page30.txt: [('forty-', 'forty')] LibM19140401-V09-02-page32.txt: [('-', '')] LibM19140401-V09-02-page33.txt: [('en-', 'en')] LibM19140401-V09-02-page35.txt: [('PROTES-', 'PROTES')] LibM19140401-V09-02-page36.txt: [('in-', 'in')] LibM19140401-V09-02-page38.txt: [('-', '')] LibM19140401-V09-02-page41.txt: [('MAGA-', 'MAGA')] LibM19140401-V09-02-page43.txt: [('BE-', 'BE'), ('-', '')] LibM19140401-V09-02-page44.txt: [('A-i-', 'A-i'), ('.-', '.'), ('PARTNER-', 'PARTNER')] LibM19140401-V09-02-page46.txt: [('-', '')] LibM19140401-V09-02-page48.txt: [('-.', '.')] LibM19140401-V09-02-page49.txt: [('---.', '--.'), ('---il', '--il')] LibM19140401-V09-02-page5.txt: [('ad-', 'ad')] LibM19140401-V09-02-page50.txt: [('ADVER-', 'ADVER')] LibM19140401-V09-02-page52.txt: [('dan-', 'dan'), ('stern-', 'stern'), ('in-', 'in'), ('re-', 're'), ('-', ''), ('-', '')] LibM19140401-V09-02-page6.txt: [('-', '')] LibM19140401-V09-02-page7.txt: [('-', ''), ('MWO-', 'MWO'), ('MOD-', 'MOD'), ('glo-', 'glo')] LibM19140701-V09-03-page10.txt: [('sacra-', 'sacra'), ('-Surely', 'Surely'), ('op-', 'op')] LibM19140701-V09-03-page11.txt: [('estab-', 'estab')] LibM19140701-V09-03-page12.txt: [('--', '-')] LibM19140701-V09-03-page15.txt: [('transi-', 'transi')] LibM19140701-V09-03-page17.txt: [('sub-', 'sub'), ('re-', 're')] LibM19140701-V09-03-page2.txt: [('-', '')] LibM19140701-V09-03-page24.txt: [('-', ''), ('-', '')] LibM19140701-V09-03-page27.txt: [('-ence', 'ence')] LibM19140701-V09-03-page29.txt: [('-', '')] LibM19140701-V09-03-page3.txt: [('-', ''), ('-', ''), ('CIRCULAT-', 'CIRCULAT')] LibM19140701-V09-03-page31.txt: [('free-', 'free')] LibM19140701-V09-03-page33.txt: [('se-', 'se')] LibM19140701-V09-03-page34.txt: [('prop-', 'prop'), ('-', ''), ('ambi-', 'ambi')] LibM19140701-V09-03-page35.txt: [('-', '')] LibM19140701-V09-03-page36.txt: [('-', ''), ('rea-', 'rea'), ('Chris-', 'Chris')] LibM19140701-V09-03-page39.txt: [('-', ''), ('boy-', 'boy')] LibM19140701-V09-03-page4.txt: [('magazine-', 'magazine'), ('-', '')] LibM19140701-V09-03-page40.txt: [('itsfunda-', 'itsfunda'), ('-theft', 'theft')] LibM19140701-V09-03-page42.txt: [('-Most', 'Most'), ('-', '')] LibM19140701-V09-03-page44.txt: [('prohibit-', 'prohibit')] LibM19140701-V09-03-page48.txt: [('-.', '.')] LibM19140701-V09-03-page49.txt: [('-', ''), ('k-', 'k'), ('-i..', 'i..'), ('arwl-A-', 'arwl-A'), ('-', ''), ('-"', '"'), ("'-", "'"), ('-', '')] LibM19140701-V09-03-page5.txt: [('-', ''), ('ad-', 'ad')] LibM19140701-V09-03-page51.txt: [('V-', 'V'), ('-The', 'The'), ('."-', '."')] LibM19140701-V09-03-page7.txt: [('be-', 'be')] LibM19141001-V09-04-page10.txt: [('-', ''), ('-', ''), ('fail-', 'fail')] LibM19141001-V09-04-page11.txt: [('-', ''), ('-the', 'the')] LibM19141001-V09-04-page13.txt: [('bless-', 'bless'), ('re-', 're')] LibM19141001-V09-04-page14.txt: [('re-', 're')] LibM19141001-V09-04-page18.txt: [('Robes-', 'Robes'), ('be-', 'be')] LibM19141001-V09-04-page19.txt: [("-law.'", "law.'")] LibM19141001-V09-04-page2.txt: [('-', ''), ('-', '')] LibM19141001-V09-04-page22.txt: [('penal-', 'penal')] LibM19141001-V09-04-page26.txt: [('TI-', 'TI')] LibM19141001-V09-04-page27.txt: [('en-', 'en'), ('say-', 'say')] LibM19141001-V09-04-page29.txt: [('Medo-', 'Medo'), ('es-', 'es')] LibM19141001-V09-04-page30.txt: [('-in', 'in'), ('-note', 'note')] LibM19141001-V09-04-page31.txt: [('AMERI-', 'AMERI')] LibM19141001-V09-04-page33.txt: [('an-', 'an'), ('-', ''), ('Star-', 'Star'), ('-', ''), ('-', '')] LibM19141001-V09-04-page34.txt: [('-', ''), ('STAR-', 'STAR')] LibM19141001-V09-04-page35.txt: [('-', ''), ('rz-', 'rz'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('.-', '.'), ('.--', '.-'), ('-rs.', 'rs.'), ('-', ''), ('-', ''), ('.-', '.'), ('-', '')] LibM19141001-V09-04-page36.txt: [('espe--', 'espe-'), ('--', '-')] LibM19141001-V09-04-page37.txt: [('wor-', 'wor')] LibM19141001-V09-04-page38.txt: [('es-', 'es')] LibM19141001-V09-04-page39.txt: [('-', '')] LibM19141001-V09-04-page4.txt: [('ad-', 'ad'), ('M-', 'M')] LibM19141001-V09-04-page42.txt: [('op-', 'op')] LibM19141001-V09-04-page43.txt: [('here.-', 'here.'), ("'-", "'")] LibM19141001-V09-04-page44.txt: [('away.-', 'away.')] LibM19141001-V09-04-page46.txt: [('-', ''), ('"-', '"')] LibM19141001-V09-04-page48.txt: [('-', '')] LibM19141001-V09-04-page49.txt: [('-', '')] LibM19141001-V09-04-page50.txt: [('Twenty-', 'Twenty'), ('-.-', '.-'), ('Mili-', 'Mili'), ('Hala-', 'Hala'), ('-.', '.'), ('At-', 'At'), ('Lan-', 'Lan'), ('-rli', 'rli'), ('Tram-', 'Tram'), ('J-', 'J'), ('Pe-', 'Pe'), ('Albu-', 'Albu'), ('LI-', 'LI'), ('Bloom-', 'Bloom'), ('--', '-'), ('-.', '.'), ('-', ''), ('-', '')] LibM19141001-V09-04-page51.txt: [('-', ''), ('--', '-'), ('.f------', '.f-----'), ('-----', '----'), ('-TESTINC', 'TESTINC'), ('-i', 'i'), ('monarchi-', 'monarchi'), ('Con-', 'Con'), ('-', '')] LibM19141001-V09-04-page52.txt: [('DR.A-', 'DR.A'), ('-', ''), ('-.', '.')] LibM19141001-V09-04-page7.txt: [('-', ''), ('-', ''), ('-.MM', '.MM')] LibM19141001-V09-04-page8.txt: [('-', ''), ('na-', 'na'), ('Har-', 'Har'), ('-', '')] LibM19141001-V09-04-page9.txt: [('-', ''), ('-', '')] LibM19150101-V10-01-page10.txt: [('-', '')] LibM19150101-V10-01-page11.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] LibM19150101-V10-01-page13.txt: [('Con-', 'Con')] LibM19150101-V10-01-page15.txt: [('re-', 're')] LibM19150101-V10-01-page16.txt: [('-', ''), ('-i', 'i'), ('---', '--')] LibM19150101-V10-01-page17.txt: [('discrimi-', 'discrimi')] LibM19150101-V10-01-page18.txt: [('-', '')] LibM19150101-V10-01-page2.txt: [('pre-', 'pre'), ('-', ''), ('affil-', 'affil')] LibM19150101-V10-01-page20.txt: [('-', '')] LibM19150101-V10-01-page25.txt: [('-io.', 'io.'), ('-', ''), ('-', ''), ('destruc-', 'destruc')] LibM19150101-V10-01-page26.txt: [('-', ''), ('-', '')] LibM19150101-V10-01-page27.txt: [('declared-', 'declared')] LibM19150101-V10-01-page28.txt: [('Sat-', 'Sat')] LibM19150101-V10-01-page29.txt: [('viola-', 'viola')] LibM19150101-V10-01-page3.txt: [('magae-', 'magae'), ('SUBSCRIP-', 'SUBSCRIP')] LibM19150101-V10-01-page30.txt: [('-I', 'I')] LibM19150101-V10-01-page31.txt: [('prohibit-', 'prohibit')] LibM19150101-V10-01-page34.txt: [('Star-', 'Star')] LibM19150101-V10-01-page35.txt: [('cathe-', 'cathe')] LibM19150101-V10-01-page36.txt: [('of-', 'of')] LibM19150101-V10-01-page38.txt: [('fol-', 'fol')] LibM19150101-V10-01-page39.txt: [('-The', 'The')] LibM19150101-V10-01-page41.txt: [('A-', 'A'), ('-', '')] LibM19150101-V10-01-page42.txt: [('to-', 'to')] LibM19150101-V10-01-page45.txt: [('-', '')] LibM19150101-V10-01-page46.txt: [('-', '')] LibM19150101-V10-01-page47.txt: [('or-', 'or')] LibM19150101-V10-01-page48.txt: [('Alco-', 'Alco')] LibM19150101-V10-01-page50.txt: [('Sunday.-', 'Sunday.'), ('-', ''), ('-sorrow', 'sorrow'), ('-', ''), ('-', ''), ('an-', 'an')] LibM19150101-V10-01-page51.txt: [('Ti-', 'Ti'), ('-', '')] LibM19150101-V10-01-page52.txt: [('Mill-', 'Mill'), ('Rap-', 'Rap'), ('.mmmmEiv-', '.mmmmEiv'), ('Trum-', 'Trum'), ('Pe-', 'Pe'), ('Lan-', 'Lan'), ('Luck-', 'Luck'), ('Alba-', 'Alba'), ('Aven-', 'Aven'), ('Bloom-', 'Bloom'), ('-', '')] LibM19150101-V10-01-page53.txt: [('-', ''), ('FREE-', 'FREE'), ('-', '')] LibM19150101-V10-01-page8.txt: [('-', '')] LibM19150401-V10-02-page11.txt: [('intro-', 'intro')] LibM19150401-V10-02-page12.txt: [('litho-', 'litho'), ('Corn-', 'Corn')] LibM19150401-V10-02-page14.txt: [('Postmaster-', 'Postmaster'), ('pam-', 'pam')] LibM19150401-V10-02-page15.txt: [('Postmaster-', 'Postmaster')] LibM19150401-V10-02-page17.txt: [('-', ''), ('-legislation.', 'legislation.'), ('un-', 'un'), ('pub-', 'pub')] LibM19150401-V10-02-page18.txt: [('of-', 'of'), ('re-', 're')] LibM19150401-V10-02-page19.txt: [('Mc-', 'Mc')] LibM19150401-V10-02-page2.txt: [('-', '')] LibM19150401-V10-02-page21.txt: [('RE-', 'RE')] LibM19150401-V10-02-page23.txt: [('free-', 'free'), ('WASH-', 'WASH'), ('reli-', 'reli')] LibM19150401-V10-02-page25.txt: [('WASH-', 'WASH'), ('Postmaster-', 'Postmaster'), ('de-', 'de'), ('WASH-', 'WASH')] LibM19150401-V10-02-page26.txt: [('Postmaster-', 'Postmaster')] LibM19150401-V10-02-page27.txt: [('Cath-', 'Cath')] LibM19150401-V10-02-page28.txt: [('CAP-', 'CAP')] LibM19150401-V10-02-page29.txt: [('per-', 'per')] LibM19150401-V10-02-page3.txt: [('cer-', 'cer'), ('Hear-', 'Hear')] LibM19150401-V10-02-page30.txt: [('or-', 'or')] LibM19150401-V10-02-page32.txt: [('gen-', 'gen')] LibM19150401-V10-02-page36.txt: [('Lot-', 'Lot')] LibM19150401-V10-02-page38.txt: [('-the', 'the')] LibM19150401-V10-02-page4.txt: [('.-', '.')] LibM19150401-V10-02-page41.txt: [('-the', 'the')] LibM19150401-V10-02-page43.txt: [('.-', '.')] LibM19150401-V10-02-page44.txt: [('un-', 'un'), ('-', '')] LibM19150401-V10-02-page46.txt: [('pre-', 'pre'), ('sub-', 'sub'), ("'O-", "'O")] LibM19150401-V10-02-page48.txt: [('-', ''), ('sa-', 'sa'), ('busi-', 'busi'), ('hence-', 'hence'), ('-', '')] LibM19150401-V10-02-page49.txt: [('-', ''), ('--', '-'), ('HUN-', 'HUN')] LibM19150401-V10-02-page5.txt: [('Philip-', 'Philip')] LibM19150401-V10-02-page50.txt: [('-', ''), ('-', ''), ('.-', '.'), ('-M.', 'M.'), ('.-', '.'), ('signifi-', 'signifi'), ('-', ''), ('-', '')] LibM19150401-V10-02-page51.txt: [('.--', '.-'), ('-', ''), ('.-', '.'), ('-PER', 'PER')] LibM19150401-V10-02-page52.txt: [('-', ''), ('-', ''), ('-', ''), ('iiimm--', 'iiimm-'), ('---', '--'), ('Ad-', 'Ad')] LibM19150401-V10-02-page6.txt: [('---', '--'), ('kc-', 'kc'), ('-', ''), ('PEACE-', 'PEACE'), ('ASSEMB-', 'ASSEMB'), ('-', ''), ('-', ''), ('lost.-', 'lost.'), ('ri-', 'ri'), ('-K', 'K')] LibM19150401-V10-02-page7.txt: [('-', ''), ('-', '')] LibM19150401-V10-02-page9.txt: [('-', '')] LibM19150701-V10-03-page1.txt: [('--', '-')] LibM19150701-V10-03-page10.txt: [('-', '')] LibM19150701-V10-03-page14.txt: [('be-', 'be')] LibM19150701-V10-03-page15.txt: [('-', '')] LibM19150701-V10-03-page17.txt: [('-', '')] LibM19150701-V10-03-page2.txt: [('inter-', 'inter'), ('Col-', 'Col'), ('affil-', 'affil'), ('affili-', 'affili'), ('Massa-', 'Massa'), ('Connecti-', 'Connecti')] LibM19150701-V10-03-page21.txt: [('-', '')] LibM19150701-V10-03-page22.txt: [('-', ''), ('Chris-', 'Chris'), ('Eng-', 'Eng')] LibM19150701-V10-03-page25.txt: [('plot-', 'plot')] LibM19150701-V10-03-page26.txt: [('meth-', 'meth')] LibM19150701-V10-03-page27.txt: [('-', ''), ('-', ''), ('-', ''), ('-', '')] LibM19150701-V10-03-page28.txt: [('degen-', 'degen')] LibM19150701-V10-03-page3.txt: [('illus-', 'illus'), ('pro-', 'pro'), ('prohibi-', 'prohibi'), ('SUB-', 'SUB'), ('attor-', 'attor'), ('-', '')] LibM19150701-V10-03-page30.txt: [('princi-', 'princi')] LibM19150701-V10-03-page33.txt: [('clas-', 'clas')] LibM19150701-V10-03-page36.txt: [('zeal-', 'zeal')] LibM19150701-V10-03-page4.txt: [('ad-', 'ad')] LibM19150701-V10-03-page42.txt: [('-', ''), ('-sAfd', 'sAfd'), ('-', '')] LibM19150701-V10-03-page45.txt: [('-', ''), ('-', '')] LibM19150701-V10-03-page46.txt: [('-', '')] LibM19150701-V10-03-page47.txt: [('-', ''), ('-', ''), ('caus-', 'caus')] LibM19150701-V10-03-page49.txt: [('-', '')] LibM19150701-V10-03-page50.txt: [('Fa-', 'Fa')] LibM19150701-V10-03-page8.txt: [('-', '')] LibM19151001-V10-04-page1.txt: [('-', '')] LibM19151001-V10-04-page10.txt: [('-', '')] LibM19151001-V10-04-page11.txt: [('lib-', 'lib')] LibM19151001-V10-04-page12.txt: [('-', '')] LibM19151001-V10-04-page14.txt: [('pub-', 'pub')] LibM19151001-V10-04-page15.txt: [('-', ''), ('-', '')] LibM19151001-V10-04-page18.txt: [('-', '')] LibM19151001-V10-04-page19.txt: [('-', ''), ('-', ''), ('dis-', 'dis')] LibM19151001-V10-04-page2.txt: [('-', ''), ('Col-', 'Col'), ('af-', 'af')] LibM19151001-V10-04-page23.txt: [('former-', 'former')] LibM19151001-V10-04-page25.txt: [('-', '')] LibM19151001-V10-04-page26.txt: [('bul-', 'bul'), ('to-', 'to')] LibM19151001-V10-04-page27.txt: [('-', ''), ('s-', 's'), ('-', '')] LibM19151001-V10-04-page28.txt: [('reli-', 'reli')] LibM19151001-V10-04-page30.txt: [('indi-', 'indi')] LibM19151001-V10-04-page31.txt: [('-proper', 'proper'), ('-', '')] LibM19151001-V10-04-page33.txt: [('-', '')] LibM19151001-V10-04-page37.txt: [('-', '')] LibM19151001-V10-04-page42.txt: [('Panama-', 'Panama'), ('repre-', 'repre')] LibM19151001-V10-04-page45.txt: [('-', '')] LibM19151001-V10-04-page48.txt: [('Ama-', 'Ama'), ('Eng-', 'Eng'), ('Bloom-', 'Bloom'), ('-', ''), ('go-', 'go'), ('Mili-', 'Mili')] LibM19151001-V10-04-page49.txt: [('effec-', 'effec'), ('per-', 'per')] LibM19151001-V10-04-page51.txt: [('Tem-', 'Tem')] LibM19151001-V10-04-page7.txt: [('-WARDE', 'WARDE')] LibM19160101-V11-01-page11.txt: [('legisla-', 'legisla'), ('Peru-', 'Peru')] LibM19160101-V11-01-page12.txt: [('bish-', 'bish')] LibM19160101-V11-01-page13.txt: [('-', ''), ('-', '')] LibM19160101-V11-01-page18.txt: [('institu-', 'institu')] LibM19160101-V11-01-page21.txt: [('-', '')] LibM19160101-V11-01-page23.txt: [('-', ''), ('-', ''), ('-', ''), ('lan-', 'lan')] LibM19160101-V11-01-page25.txt: [('prob-', 'prob')] LibM19160101-V11-01-page27.txt: [('be-', 'be')] LibM19160101-V11-01-page28.txt: [('-', '')] LibM19160101-V11-01-page30.txt: [('perni-', 'perni')] LibM19160101-V11-01-page35.txt: [('Postmaster-', 'Postmaster')] LibM19160101-V11-01-page36.txt: [('-I', 'I')] LibM19160101-V11-01-page4.txt: [('-', '')] LibM19160101-V11-01-page44.txt: [('-', ''), ('-', ''), ('-e....lft', 'e....lft'), ('-', ''), ('--', '-'), ('-', ''), ('-', ''), ('-', ''), ('--', '-'), ('-', ''), ('......-', '......'), ('...-', '...'), ('-', ''), ('-', ''), ('-', ''), ('....-', '....'), ('-', ''), ('--', '-'), ('-', ''), ('".-r-', '".-r'), ('-', ''), ('-', ''), ('-', ''), ('-..', '..'), ("-'t", "'t"), ('-', ''), ('---', '--'), ('-', ''), ('-', ''), ('-', ''), ('A-', 'A'), ('--', '-'), ('-', ''), ('--', '-'), ('-', ''), ('-', ''), ('-f...-V', 'f...-V'), ('--', '-'), ('-', ''), ('-', ''), ('-', '')] LibM19160101-V11-01-page45.txt: [('con-', 'con')] LibM19160101-V11-01-page5.txt: [('--', '-')] LibM19160101-V11-01-page6.txt: [('KEN-', 'KEN')] LibM19160101-V11-01-page8.txt: [('-', '')] LibM19160101-V11-01-page9.txt: [('--', '-')] LibM19160101-V11-01e-page11.txt: [('drug-', 'drug')] LibM19160101-V11-01e-page16.txt: [('-IN', 'IN'), ('repro-', 'repro'), ('en-', 'en'), ('mail-', 'mail'), ('-with', 'with'), ('pub-', 'pub'), ('assur-', 'assur'), ('mails-', 'mails')] LibM19160101-V11-01e-page3.txt: [('-', '')] LibM19160101-V11-01e-page4.txt: [('liberty-', 'liberty'), ('jury.--', 'jury.-'), ('-', '')] LibM19160101-V11-01e-page5.txt: [('senti-', 'senti')] LibM19160101-V11-01e-page9.txt: [('P-', 'P'), ('be-', 'be')] LibM19160401-V11-02-page1.txt: [('-', '')] LibM19160401-V11-02-page10.txt: [('OBSERV-', 'OBSERV'), ("'Na-", "'Na")] LibM19160401-V11-02-page13.txt: [('be-', 'be'), ('au-', 'au')] LibM19160401-V11-02-page14.txt: [('-', ''), ('persecution.--', 'persecution.-')] LibM19160401-V11-02-page16.txt: [('mat-', 'mat')] LibM19160401-V11-02-page17.txt: [('censor-', 'censor')] LibM19160401-V11-02-page18.txt: [('Corn-', 'Corn'), ('Postmaster-', 'Postmaster')] LibM19160401-V11-02-page2.txt: [('-', ''), ('"-', '"'), ('wor-', 'wor'), ('-', ''), ('-', ''), ('fore-', 'fore'), ('prop-', 'prop'), ('-', ''), ('scurril-', 'scurril'), ('mat-', 'mat'), ('decide.-', 'decide.')] LibM19160401-V11-02-page20.txt: [('-', '')] LibM19160401-V11-02-page22.txt: [('often-', 'often'), ('mat-', 'mat')] LibM19160401-V11-02-page24.txt: [('omis-', 'omis')] LibM19160401-V11-02-page26.txt: [('-', ''), ("'-.-", "'-."), ('-.--.-', '.--.-'), ('.-', '.'), ('.-', '.'), ('-f.', 'f.'), ('Or-', 'Or'), ('-', ''), ('-', ''), ('-.-', '.-'), ('-', ''), ("'-", "'"), ('-', ''), ('.....-', '.....'), ('r-', 'r'), ('-', ''), ('----', '---'), ('-', ''), ('-.', '.'), ('-', ''), ('-', ''), ("-'", "'"), ('-', ''), ('--', '-'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('"-', '"'), ('--', '-'), ('------.---', '-----.---'), ('...-', '...'), ('--', '-'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('.-', '.'), ('..-.-.-', '..-.-.'), ('f\'"-----', 'f\'"----'), ('-...-.', '...-.'), ('"-..-..-', '"-..-..'), ('-', ''), ('-', ''), ('--', '-'), ('-', ''), ('--', '-'), ('-..-', '..-'), ('-.', '.'), ('-....', '....'), ("----'", "---'"), ('--..', '-..')] LibM19160401-V11-02-page28.txt: [('there-', 'there')] LibM19160401-V11-02-page29.txt: [('-', '')] LibM19160401-V11-02-page30.txt: [('Mc-', 'Mc')] LibM19160401-V11-02-page31.txt: [('.-', '.')] LibM19160401-V11-02-page32.txt: [('pri-', 'pri')] LibM19160401-V11-02-page33.txt: [('-', '')] LibM19160401-V11-02-page36.txt: [('liv-', 'liv')] LibM19160401-V11-02-page39.txt: [('recog-', 'recog')] LibM19160401-V11-02-page4.txt: [('-', ''), ('-', ''), ('-', ''), ('--', '-'), ('mem-', 'mem'), ('there-', 'there')] LibM19160401-V11-02-page40.txt: [('en-', 'en'), ('-', '')] LibM19160401-V11-02-page43.txt: [('Pa-', 'Pa')] LibM19160401-V11-02-page45.txt: [('--', '-')] LibM19160401-V11-02-page48.txt: [('the-', 'the')] LibM19160401-V11-02-page49.txt: [('Mc-', 'Mc')] LibM19160401-V11-02-page5.txt: [('meas-', 'meas')] LibM19160401-V11-02-page51.txt: [('-', '')] LibM19160401-V11-02-page6.txt: [('-being', 'being')] LibM19160401-V11-02-page7.txt: [('-', '')] LibM19160401-V11-02-page8.txt: [('Congress-', 'Congress')] LibM19160401-V11-02-page9.txt: [('-', '')] LibM19160401-V11-02e-page1.txt: [('-', ''), ('-', '')] LibM19160401-V11-02e-page12.txt: [('Pot-', 'Pot')] LibM19160401-V11-02e-page14.txt: [('---', '--')] LibM19160401-V11-02e-page3.txt: [('-', '')] LibM19160401-V11-02e-page5.txt: [('-be', 'be')] LibM19160401-V11-02e-page9.txt: [('morals.--', 'morals.-')] LibM19160701-V11-03-page12.txt: [('-', '')] LibM19160701-V11-03-page14.txt: [('execu-', 'execu')] LibM19160701-V11-03-page15.txt: [('legit-', 'legit')] LibM19160701-V11-03-page16.txt: [('-Rest-in-Seven', 'Rest-in-Seven')] LibM19160701-V11-03-page18.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('sup-', 'sup')] LibM19160701-V11-03-page21.txt: [('-', ''), ('-', ''), ('Sec-', 'Sec')] LibM19160701-V11-03-page22.txt: [('-', '')] LibM19160701-V11-03-page23.txt: [('-', ''), ('Vat-', 'Vat')] LibM19160701-V11-03-page26.txt: [('-', ''), ('institu-', 'institu'), ('-', '')] LibM19160701-V11-03-page28.txt: [('-', ''), ('tem-', 'tem')] LibM19160701-V11-03-page29.txt: [('deci-', 'deci')] LibM19160701-V11-03-page3.txt: [('-', '')] LibM19160701-V11-03-page30.txt: [('remem-', 'remem')] LibM19160701-V11-03-page31.txt: [('dis-', 'dis'), ('-United', 'United'), ('cur-', 'cur')] LibM19160701-V11-03-page32.txt: [('-', '')] LibM19160701-V11-03-page33.txt: [('-', '')] LibM19160701-V11-03-page34.txt: [('-', '')] LibM19160701-V11-03-page35.txt: [('-', '')] LibM19160701-V11-03-page40.txt: [('-from', 'from')] LibM19160701-V11-03-page41.txt: [('-', '')] LibM19160701-V11-03-page43.txt: [('-', '')] LibM19160701-V11-03-page49.txt: [('Postmaster-', 'Postmaster'), ('Postmaster-', 'Postmaster')] LibM19160701-V11-03-page51.txt: [('-', '')] LibM19160701-V11-03-page9.txt: [('-', '')] LibM19161001-V11-04-page1.txt: [('-', '')] LibM19161001-V11-04-page12.txt: [('superstitions."--', 'superstitions."-')] LibM19161001-V11-04-page15.txt: [('CHAR-', 'CHAR')] LibM19161001-V11-04-page16.txt: [('-', '')] LibM19161001-V11-04-page17.txt: [('Watch-', 'Watch'), ('Postmaster-', 'Postmaster')] LibM19161001-V11-04-page2.txt: [('inter-', 'inter')] LibM19161001-V11-04-page20.txt: [('through-', 'through')] LibM19161001-V11-04-page21.txt: [('-', '')] LibM19161001-V11-04-page22.txt: [('as-', 'as'), ('-', '')] LibM19161001-V11-04-page23.txt: [('-', '')] LibM19161001-V11-04-page24.txt: [('unde-', 'unde'), ('observ-', 'observ')] LibM19161001-V11-04-page25.txt: [('mil-', 'mil')] LibM19161001-V11-04-page26.txt: [('-', ''), ('suf-', 'suf')] LibM19161001-V11-04-page27.txt: [('right-', 'right')] LibM19161001-V11-04-page33.txt: [('-', '')] LibM19161001-V11-04-page36.txt: [('stir-', 'stir')] LibM19161001-V11-04-page39.txt: [('-revived', 'revived')] LibM19161001-V11-04-page40.txt: [('how-', 'how')] LibM19161001-V11-04-page41.txt: [('denomi-', 'denomi'), ('-', ''), ('re-', 're')] LibM19161001-V11-04-page44.txt: [('govern-', 'govern'), ('.-', '.'), ('-', '')] LibM19161001-V11-04-page45.txt: [('ac-', 'ac')] LibM19161001-V11-04-page47.txt: [('voy-', 'voy')] LibM19161001-V11-04-page49.txt: [('Anti-', 'Anti')] LibM19161001-V11-04-page50.txt: [('-', '')] LibM19161001-V11-04-page52.txt: [('T-', 'T')] LibM19161001-V11-04-page6.txt: [('ex-', 'ex')] LibM19170101-V12-01-page1.txt: [('-', ''), ('-', '')] LibM19170101-V12-01-page13.txt: [('-', '')] LibM19170101-V12-01-page14.txt: [('dis-', 'dis')] LibM19170101-V12-01-page16.txt: [('-', '')] LibM19170101-V12-01-page19.txt: [('-', ''), ('-', ''), ('DE-', 'DE')] LibM19170101-V12-01-page2.txt: [('inter-', 'inter'), ('af-', 'af'), ('Ten-', 'Ten'), ('Wat-', 'Wat')] LibM19170101-V12-01-page23.txt: [('-legislation', 'legislation')] LibM19170101-V12-01-page27.txt: [('-', '')] LibM19170101-V12-01-page3.txt: [('Sab-', 'Sab')] LibM19170101-V12-01-page34.txt: [('religious-', 'religious'), ('-', '')] LibM19170101-V12-01-page6.txt: [('-as', 'as'), ('re-', 're')] LibM19170101-V12-01-page7.txt: [('un-', 'un')] LibM19170101-V12-01-page9.txt: [('-', '')] LibM19170401-V12-02-page10.txt: [('work-', 'work')] LibM19170401-V12-02-page11.txt: [('praise-', 'praise')] LibM19170401-V12-02-page18.txt: [('non-', 'non')] LibM19170401-V12-02-page21.txt: [('valid-', 'valid')] LibM19170401-V12-02-page23.txt: [('-', '')] LibM19170401-V12-02-page27.txt: [('founda-', 'founda')] LibM19170401-V12-02-page29.txt: [('recog-', 'recog')] LibM19170401-V12-02-page30.txt: [('Attorney-', 'Attorney')] LibM19170401-V12-02-page33.txt: [('-observance', 'observance')] LibM19170401-V12-02-page34.txt: [('-', ''), ('Multi-', 'Multi')] LibM19170401-V12-02-page35.txt: [('-', ''), ('alfigent-', 'alfigent')] LibM19170401-V12-02-page5.txt: [('-', ''), ('at-', 'at')] LibM19170401-V12-02-page7.txt: [('-', '')] LibM19170401-V12-02-page8.txt: [('in-', 'in')] LibM19170701-V12-03-page1.txt: [('-', ''), ('-', ''), ('-', ''), ('-.....', '.....')] LibM19170701-V12-03-page12.txt: [('govern-', 'govern')] LibM19170701-V12-03-page13.txt: [('-', '')] LibM19170701-V12-03-page14.txt: [('un-', 'un')] LibM19170701-V12-03-page15.txt: [('.-', '.'), ('-', '')] LibM19170701-V12-03-page19.txt: [('r.nr--', 'r.nr-')] LibM19170701-V12-03-page2.txt: [('Mis-', 'Mis')] LibM19170701-V12-03-page20.txt: [('III.-', 'III.'), ('CXXX.-', 'CXXX.'), ('gover-', 'gover')] LibM19170701-V12-03-page23.txt: [('---', '--'), ('---', '--')] LibM19170701-V12-03-page26.txt: [('-', '')] LibM19170701-V12-03-page28.txt: [('-', '')] LibM19170701-V12-03-page29.txt: [('-', '')] LibM19170701-V12-03-page3.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('---', '--'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('--', '-'), ('-', ''), ('-', ''), ('--', '-'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('--', '-'), ('-', ''), ('--', '-'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('--', '-'), ('-', ''), ('-', ''), ('-', ''), ('-', '')] LibM19170701-V12-03-page31.txt: [('-', '')] LibM19170701-V12-03-page32.txt: [('-', '')] LibM19170701-V12-03-page33.txt: [('-', ''), ('-', ''), ('-', '')] LibM19170701-V12-03-page36.txt: [('-', ''), ('-', ''), ('.-', '.')] LibM19170701-V12-03-page8.txt: [('-', '')] LibM19170701-V12-03-page9.txt: [('-', '')] LibM19171001-V12-04-page10.txt: [('POR-', 'POR'), ('CON-', 'CON')] LibM19171001-V12-04-page11.txt: [('suav-', 'suav'), ('-their', 'their'), ('unlim-', 'unlim')] LibM19171001-V12-04-page12.txt: [('-', ''), ('Medo-', 'Medo'), ('-', '')] LibM19171001-V12-04-page13.txt: [('-', '')] LibM19171001-V12-04-page16.txt: [('-', '')] LibM19171001-V12-04-page18.txt: [('P-', 'P'), ('Protestant-', 'Protestant'), ('-o', 'o')] LibM19171001-V12-04-page19.txt: [('P-', 'P')] LibM19171001-V12-04-page21.txt: [('effec-', 'effec')] LibM19171001-V12-04-page23.txt: [('-', ''), ('lines.-', 'lines.')] LibM19171001-V12-04-page27.txt: [('-', '')] LibM19171001-V12-04-page28.txt: [('under-', 'under'), ('un-', 'un')] LibM19171001-V12-04-page3.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] LibM19171001-V12-04-page30.txt: [('-r', 'r')] LibM19171001-V12-04-page34.txt: [('Alleghanies.-', 'Alleghanies.')] LibM19171001-V12-04-page35.txt: [('Food-', 'Food')] LibM19171001-V12-04-page7.txt: [('.-', '.')] LibM19180101-V13-01-page1.txt: [('ress--', 'ress-'), ('er-', 'er')] LibM19180101-V13-01-page11.txt: [('--', '-')] LibM19180101-V13-01-page12.txt: [('intro-', 'intro'), ('con-', 'con')] LibM19180101-V13-01-page17.txt: [('Postmaster-', 'Postmaster'), ('deter-', 'deter')] LibM19180101-V13-01-page19.txt: [('ar-', 'ar'), ('-', '')] LibM19180101-V13-01-page24.txt: [('power-', 'power'), ('Ars--', 'Ars-'), ('-', ''), ('enfranchise-', 'enfranchise')] LibM19180101-V13-01-page28.txt: [('-', '')] LibM19180101-V13-01-page3.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] LibM19180101-V13-01-page31.txt: [('re-', 're')] LibM19180101-V13-01-page33.txt: [('Anti-', 'Anti')] LibM19180101-V13-01-page4.txt: [('-', ''), ('-', ''), ('-C', 'C'), ('-s', 's'), ('-', '')] LibM19180101-V13-01-page6.txt: [('-this', 'this'), ('foun-', 'foun')] LibM19180101-V13-01-page7.txt: [('can-', 'can')] LibM19180101-V13-01-page8.txt: [('Rear-', 'Rear')] LibM19180401-V13-02-page1.txt: [('-', '')] LibM19180401-V13-02-page12.txt: [('Go-to-', 'Go-to')] LibM19180401-V13-02-page13.txt: [('.---', '.--'), ('--', '-'), ('-s-', 's-'), ('con-', 'con'), ("-to'i.", "to'i."), ('.-', '.'), ('----', '---'), ('---', '--'), ('--', '-'), ('---', '--'), ('-', ''), ('.-.-', '.-.'), ('-', ''), ('-.', '.'), ('-', ''), ('---', '--'), ('Eng-', 'Eng'), ('-', ''), ('..--', '..-'), ('.f..--', '.f..-'), ('-', ''), ('......--', '......-'), ('---', '--'), ('-.', '.'), ('---', '--'), ('-', ''), ("-----'---..-", "----'---..-"), ('-...', '...'), ('.-.-.-.-', '.-.-.-.'), ('-', ''), ('.-', '.'), ('-.-."', '.-."')] LibM19180401-V13-02-page16.txt: [('Je-', 'Je')] LibM19180401-V13-02-page17.txt: [('-', '')] LibM19180401-V13-02-page19.txt: [('free-', 'free')] LibM19180401-V13-02-page20.txt: [('-', ''), ('-', '')] LibM19180401-V13-02-page21.txt: [('-', ''), ('-', '')] LibM19180401-V13-02-page22.txt: [('-', ''), ('-Palestine', 'Palestine'), ('-', '')] LibM19180401-V13-02-page23.txt: [('-', ''), ('-', ''), ('-', '')] LibM19180401-V13-02-page24.txt: [('MASSA-', 'MASSA')] LibM19180401-V13-02-page26.txt: [('Co-', 'Co')] LibM19180401-V13-02-page28.txt: [('-', ''), ('-', '')] LibM19180401-V13-02-page3.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] LibM19180401-V13-02-page30.txt: [('-', '')] LibM19180401-V13-02-page35.txt: [('-', ''), ('-', ''), ('-', ''), ('r-', 'r'), ('---', '--')] LibM19180401-V13-02-page5.txt: [('thered.-', 'thered.')] LibM19180701-V13-03-page1.txt: [('-', '')] LibM19180701-V13-03-page14.txt: [('-', '')] LibM19180701-V13-03-page16.txt: [('-', ''), ('-', '')] LibM19180701-V13-03-page17.txt: [('-', '')] LibM19180701-V13-03-page18.txt: [('af-', 'af')] LibM19180701-V13-03-page25.txt: [('apes-', 'apes')] LibM19180701-V13-03-page29.txt: [('Jean-', 'Jean')] LibM19180701-V13-03-page3.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] LibM19180701-V13-03-page30.txt: [('time..--', 'time..-')] LibM19180701-V13-03-page34.txt: [('cog-', 'cog')] LibM19180701-V13-03-page6.txt: [('fol-', 'fol')] LibM19180701-V13-03-page8.txt: [('-', '')] LibM19181001-V13-04-page1.txt: [('-', ''), ('-', ''), ('-', ''), ('-', '')] LibM19181001-V13-04-page11.txt: [('lit-', 'lit'), ('Sun-', 'Sun'), ('Ordi-', 'Ordi')] LibM19181001-V13-04-page13.txt: [('democ-', 'democ'), ('it-', 'it')] LibM19181001-V13-04-page14.txt: [('af-', 'af')] LibM19181001-V13-04-page16.txt: [('ap-', 'ap')] LibM19181001-V13-04-page18.txt: [('fore-', 'fore')] LibM19181001-V13-04-page19.txt: [('auto-', 'auto')] LibM19181001-V13-04-page2.txt: [('-.', '.'), ('.-', '.'), ('pre-', 'pre'), ('Ida-', 'Ida'), ('af-', 'af')] LibM19181001-V13-04-page20.txt: [('peril-', 'peril'), ('be-', 'be')] LibM19181001-V13-04-page23.txt: [('-', '')] LibM19181001-V13-04-page24.txt: [('-', ''), ('-', '')] LibM19181001-V13-04-page25.txt: [('-ruled', 'ruled')] LibM19181001-V13-04-page28.txt: [('-', '')] LibM19181001-V13-04-page29.txt: [('-', ''), ('Assoeia-', 'Assoeia')] LibM19181001-V13-04-page3.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] LibM19181001-V13-04-page30.txt: [('One-', 'One')] LibM19181001-V13-04-page34.txt: [('reli-', 'reli')] LibM19181001-V13-04-page5.txt: [('call-', 'call')] LibM19181001-V13-04-page6.txt: [('-', ''), ('-', '')] LibM19181001-V13-04-page7.txt: [('en-', 'en')] LibM19181001-V13-04-page8.txt: [('Declara-', 'Declara')] LibM19190101-V15-01-page1.txt: [('-', '')] LibM19190101-V15-01-page12.txt: [('-A', 'A')] LibM19190101-V15-01-page15.txt: [('-', ''), ('-', ''), ('-', '')] LibM19190101-V15-01-page17.txt: [('-', '')] LibM19190101-V15-01-page18.txt: [('Wil-', 'Wil')] LibM19190101-V15-01-page2.txt: [('Ida-', 'Ida'), ('T"-', 'T"')] LibM19190101-V15-01-page21.txt: [('-religions', 'religions')] LibM19190101-V15-01-page22.txt: [('-', '')] LibM19190101-V15-01-page23.txt: [('-', '')] LibM19190101-V15-01-page28.txt: [('-.', '.'), ('i"----', 'i"---'), ('-j', 'j'), ('-', ''), ('-', ''), ('-e-', 'e-'), ('-', ''), ('-of', 'of')] LibM19190101-V15-01-page3.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] LibM19190101-V15-01-page5.txt: [('thereof.-', 'thereof.'), ('COUN-', 'COUN'), ('-', '')] LibM19190401-V15-02-page1.txt: [('-', '')] LibM19190401-V15-02-page10.txt: [('..-', '..'), ('pro-', 'pro')] LibM19190401-V15-02-page12.txt: [('i-', 'i'), ('....-', '....')] LibM19190401-V15-02-page13.txt: [('neigh-', 'neigh')] LibM19190401-V15-02-page15.txt: [('to-', 'to'), ('-ether.', 'ether.')] LibM19190401-V15-02-page17.txt: [('-', ''), ('RE-', 'RE'), ('pro-', 'pro')] LibM19190401-V15-02-page18.txt: [('Medo-', 'Medo'), ('constrain-', 'constrain')] LibM19190401-V15-02-page19.txt: [('repub-', 'repub'), ('inter-', 'inter'), ('power-', 'power'), ('Dan-', 'Dan'), ('-', '')] LibM19190401-V15-02-page2.txt: [('-cl', 'cl')] LibM19190401-V15-02-page21.txt: [('jit-', 'jit')] LibM19190401-V15-02-page22.txt: [('-', '')] LibM19190401-V15-02-page23.txt: [('-', '')] LibM19190401-V15-02-page28.txt: [('-', '')] LibM19190401-V15-02-page3.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] LibM19190401-V15-02-page7.txt: [('major-', 'major')] LibM19190401-V15-02-page9.txt: [('-.', '.')] LibM19190701-V15-03-page10.txt: [('-', '')] LibM19190701-V15-03-page11.txt: [('Re-', 'Re')] LibM19190701-V15-03-page12.txt: [('-', ''), ('pun-', 'pun')] LibM19190701-V15-03-page13.txt: [('con-', 'con')] LibM19190701-V15-03-page18.txt: [('com-', 'com')] LibM19190701-V15-03-page2.txt: [('inter-', 'inter'), ('Of-', 'Of'), ('affil-', 'affil'), ('Co-', 'Co')] LibM19190701-V15-03-page21.txt: [('-E', 'E'), ('Ite-', 'Ite'), ('-', ''), ('-', ''), ('-', ''), ('pa-', 'pa'), ('-', '')] LibM19190701-V15-03-page22.txt: [('-', '')] LibM19190701-V15-03-page25.txt: [('---', '--'), ('-r--', 'r--'), ('-', ''), ('-', ''), ('-', ''), ('-', '')] LibM19190701-V15-03-page28.txt: [('President.\'"--', 'President.\'"-'), ('---', '--')] LibM19190701-V15-03-page30.txt: [('believ-', 'believ')] LibM19190701-V15-03-page31.txt: [('op-', 'op')] LibM19190701-V15-03-page34.txt: [('-', '')] LibM19190701-V15-03-page36.txt: [('f-', 'f'), ('-ewikik', 'ewikik')] LibM19190701-V15-03-page5.txt: [('Kt-', 'Kt'), ('-.', '.'), ('--', '-'), ('--', '-'), ('-', '')] LibM19190701-V15-03-page6.txt: [('--', '-')] LibM19190701-V15-03-page7.txt: [('Jef-', 'Jef')] LibM19190701-V15-03-page9.txt: [('con-', 'con')] LibM19191001-V15-04-page11.txt: [('Con-', 'Con')] LibM19191001-V15-04-page15.txt: [('-', ''), ('-B.', 'B.')] LibM19191001-V15-04-page17.txt: [('-', ''), ('-', ''), ('non-', 'non'), ('en-', 'en')] LibM19191001-V15-04-page21.txt: [('sur-', 'sur')] LibM19191001-V15-04-page27.txt: [('-', ''), ('-', '')] LibM19191001-V15-04-page28.txt: [('f-', 'f')] LibM19191001-V15-04-page5.txt: [('-', '')] LibM19191001-V15-04-page7.txt: [('pub-', 'pub')] LibM19191001-V15-04-page8.txt: [('Massa-', 'Massa'), ('re-', 're')] LibM19200101-V14-01-page1.txt: [('-', ''), ('-', ''), ('-', ''), ('-', '')] LibM19200101-V14-01-page12.txt: [('ESTAB-', 'ESTAB')] LibM19200101-V14-01-page15.txt: [('-', '')] LibM19200101-V14-01-page16.txt: [('-', ''), ('-', '')] LibM19200101-V14-01-page17.txt: [('reli-', 'reli'), ('MUTCH-', 'MUTCH')] LibM19200101-V14-01-page18.txt: [('cir-', 'cir')] LibM19200101-V14-01-page20.txt: [('pa-', 'pa')] LibM19200101-V14-01-page22.txt: [('-', ''), ('-', ''), ('-', ''), ('Na-', 'Na'), ('---', '--')] LibM19200101-V14-01-page32.txt: [('-', '')] LibM19200101-V14-01-page5.txt: [('fol-', 'fol')] LibM19200101-V14-01-page6.txt: [('-', ''), ('espe-', 'espe'), ('At-', 'At')] LibM19200101-V14-01-page7.txt: [('re-', 're'), ('-', '')] LibM19200101-V14-01-page8.txt: [('-', '')] LibM19200401-V14-02-page10.txt: [('ban-', 'ban')] LibM19200401-V14-02-page11.txt: [('denorai-', 'denorai'), ('-', '')] LibM19200401-V14-02-page13.txt: [('-', ''), ('-', ''), ('na-', 'na')] LibM19200401-V14-02-page14.txt: [('Sun-', 'Sun'), ('-', '')] LibM19200401-V14-02-page19.txt: [('Com-', 'Com'), ('-', '')] LibM19200401-V14-02-page21.txt: [('com-', 'com')] LibM19200401-V14-02-page22.txt: [('-', '')] LibM19200401-V14-02-page29.txt: [('-at', 'at')] LibM19200401-V14-02-page31.txt: [('---', '--'), ('a-', 'a')] LibM19200401-V14-02-page35.txt: [('--', '-'), ('-', ''), ('-', ''), ('--', '-'), ('kt-', 'kt')] LibM19200401-V14-02-page5.txt: [('thereof.-', 'thereof.'), ('-', ''), ('un-', 'un')] LibM19200401-V14-02-page6.txt: [('amuse-', 'amuse'), ('re-', 're'), ('--', '-')] LibM19200401-V14-02-page8.txt: [('Wheel-', 'Wheel'), ('advo-', 'advo')] LibM19200401-V14-02-page9.txt: [('com-', 'com')] LibM19200701-V14-03-page10.txt: [('an-', 'an')] LibM19200701-V14-03-page12.txt: [('-', ''), ('-', ''), ('-', ''), ('-.', '.')] LibM19200701-V14-03-page14.txt: [('unmistaka-', 'unmistaka'), ('ar-', 'ar'), ('-c', 'c')] LibM19200701-V14-03-page15.txt: [('-and', 'and'), ('-..', '..'), ('Sunday-', 'Sunday'), ('-iii', 'iii')] LibM19200701-V14-03-page16.txt: [('-', ''), ('Lib-', 'Lib')] LibM19200701-V14-03-page17.txt: [('-', ''), ('-', ''), ('-', ''), ('iVi-', 'iVi')] LibM19200701-V14-03-page2.txt: [('affil-', 'affil')] LibM19200701-V14-03-page20.txt: [('-', ''), ('-', ''), ('rafarowi-erivirorre-', 'rafarowi-erivirorre'), ('-mititayerwiriiiinicrierier-rimorwai-weiverreitaararforreahaarivitoroyerriiivii', 'mititayerwiriiiinicrierier-rimorwai-weiverreitaararforreahaarivitoroyerriiivii')] LibM19200701-V14-03-page24.txt: [('-', ''), ('--------', '-------')] LibM19200701-V14-03-page26.txt: [('un-', 'un')] LibM19200701-V14-03-page31.txt: [('-weesie', 'weesie'), ('Llimtstoo-', 'Llimtstoo'), ('plain-', 'plain')] LibM19200701-V14-03-page32.txt: [('-', '')] LibM19200701-V14-03-page33.txt: [('-', ''), ('Con-', 'Con')] LibM19200701-V14-03-page36.txt: [('-', ''), ('-', '')] LibM19200701-V14-03-page5.txt: [('-i-Ifidairicliiiriiirroi', 'i-Ifidairicliiiriiirroi')] LibM19200701-V14-03-page7.txt: [('-', '')] LibM19200701-V14-03-page8.txt: [('-', ''), ('-.', '.'), ('-', ''), ('-', ''), ('-vol', 'vol'), ('-i-', 'i-'), ('wofriw-', 'wofriw'), ('-iv-.-ii.-', 'iv-.-ii.-')] LibM19201001-V14-04-page14.txt: [('Amer-', 'Amer')] LibM19201001-V14-04-page15.txt: [('un-', 'un')] LibM19201001-V14-04-page16.txt: [('-', ''), ('por-', 'por')] LibM19201001-V14-04-page18.txt: [('un-', 'un')] LibM19201001-V14-04-page19.txt: [('un-', 'un'), ('Fugitive-', 'Fugitive')] LibM19201001-V14-04-page21.txt: [('in-', 'in')] LibM19201001-V14-04-page22.txt: [('Vice-', 'Vice'), ('-of', 'of')] LibM19201001-V14-04-page23.txt: [('-', '')] LibM19201001-V14-04-page26.txt: [('-', '')] LibM19201001-V14-04-page27.txt: [('neg-', 'neg')] LibM19201001-V14-04-page30.txt: [('-', '')] LibM19201001-V14-04-page34.txt: [('en-', 'en')] LibM19201001-V14-04-page4.txt: [('-MASS.', 'MASS.')] LibM19201001-V14-04-page5.txt: [('-', '')] LibM19201001-V14-04-page6.txt: [('-', '')] LibM19201001-V14-04-page7.txt: [('Fed-', 'Fed'), ('n-', 'n'), ('---', '--'), ('"--r-f-', '"--r-f'), ('-', ''), ('eXti-', 'eXti')] LibM19201001-V14-04-page9.txt: [('-', '')]
In [20]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/LibM/correction3 Average verified rate: 0.9808327456835285 Average of error rates: 0.03449303008070433 Total token count: 1452112
In [21]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[21]:
[("'", 1601), ('m', 1336), ('d', 1255), ('e', 1005), ('w', 956), ('t', 838), ('n', 784), ('r', 684), ('f', 634), ('g', 385), ('x', 271), ('u', 208), ('k', 192), ('tv', 150), ('th', 121), ('pa', 104), ('sunday-law', 92), ('re', 89), ('z', 82), ('ex', 77), ('co', 74), ('io', 72), ('id', 71), ('postmaster-general', 62), ('mo', 62), ('ga', 58), ('post-offices', 57), ('un', 57), ('un-american', 57), ('va', 56), ('statute-books', 56), ('sunday-closing', 54), ('church-and-state', 49), ('tion', 45), ('mm', 45), ('q', 44), ('li', 43), ('mt', 42), ('attorney-general', 41), ('sunday-rest', 39), ('wm', 38), ('pp', 38), ('mi', 37), ('charta', 37), ('ro', 37), ('mc', 33), ('ri', 31), ('neander', 31), ('al', 31), ('-', 30)]
Correction 4 -- Remove extra quotation marks¶
In [22]:
# %load shared_elements/replace_extra_quotation_marks.py
prev = cycle
cycle = "correction4"
directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
os.makedirs(directories['cycle'])
corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))
for filename in corpus:
content = utilities.readfile(directories['prev'], filename)
text = re.sub(r"[0-9,!?$:;&]", " ", content)
tokens = utilities.tokenize_text(text)
corrections = []
for token in tokens:
token_list = list(token)
last_char = token_list[-1]
if last_char is "'":
if len(token) > 1:
if token_list[-2] is 's' or 'S':
pass
else:
corrections.append((token, re.sub(r"'", r"", token)))
else:
pass
elif token[0] is "'":
corrections.append((token, re.sub(r"'", r"", token)))
else:
pass
if len(corrections) > 0:
print('{}: {}'.format(filename, corrections))
for correction in corrections:
content = clean.replace_pair(correction, content)
else:
pass
with open(join(directories['cycle'], filename), mode="w") as o:
o.write(content)
o.close()
LibM19060401-V01-01-page20.txt: [("'bound", 'bound')] LibM19060401-V01-01-page22.txt: [("'co", 'co')] LibM19060401-V01-01-page25.txt: [("'brought", 'brought')] LibM19060401-V01-01-page29.txt: [("'Sunday", 'Sunday'), ("'hundred", 'hundred')] LibM19060401-V01-01-page31.txt: [("'Concerning", 'Concerning'), ("'Connecticut", 'Connecticut'), ("'hearkened", 'hearkened'), ("'brother", 'brother'), ("'bow", 'bow')] LibM19060401-V01-01-page33.txt: [("'CORTELYOU", 'CORTELYOU')] LibM19060401-V01-01-page34.txt: [("'advocate", 'advocate'), ("'and", 'and'), ("'contrast", 'contrast')] LibM19060401-V01-01-page35.txt: [("'e", 'e')] LibM19060401-V01-01-page7.txt: [("'belief", 'belief')] LibM19060401-V01-01-page8.txt: [("'Caesar", 'Caesar')] LibM19060401-V01-01-page9.txt: [("'by", 'by')] LibM19060701-V01-02-page21.txt: [("'twixt", 'twixt')] LibM19061001-V01-03-page18.txt: [("'corresponding", 'corresponding')] LibM19061001-V01-03-page22.txt: [('\'"', '"')] LibM19061001-V01-03-page23.txt: [("'fragile", 'fragile'), ("'Where", 'Where')] LibM19061001-V01-03-page24.txt: [("'ir", 'ir')] LibM19061001-V01-03-page25.txt: [("'Brewer", 'Brewer'), ("'factories", 'factories'), ("'prohibited", 'prohibited')] LibM19061001-V01-03-page29.txt: [("'of", 'of')] LibM19061001-V01-03-page32.txt: [("'Judge", 'Judge'), ("''Sou.", 'Sou.')] LibM19061001-V01-03-page8.txt: [("'to", 'to')] LibM19070101-V02-01-page12.txt: [("'.", '.')] LibM19070101-V02-01-page17.txt: [("'it", 'it')] LibM19070101-V02-01-page20.txt: [("'work", 'work')] LibM19070101-V02-01-page24.txt: [("'monstrous", 'monstrous')] LibM19070101-V02-01-page32.txt: [("'Zealous", 'Zealous')] LibM19070101-V02-01-page34.txt: [("'burning", 'burning')] LibM19070401-V02-02-page12.txt: [("'on", 'on')] LibM19070401-V02-02-page2.txt: [("'twas", 'twas')] LibM19070401-V02-02-page25.txt: [("'they", 'they'), ("'the", 'the')] LibM19070401-V02-02-page27.txt: [("'il", 'il')] LibM19070401-V02-02-page28.txt: [("'said", 'said'), ("'They", 'They')] LibM19070401-V02-02-page29.txt: [("'voluntarily", 'voluntarily')] LibM19070401-V02-02-page3.txt: [("'Bless", 'Bless')] LibM19070401-V02-02-page6.txt: [("'disobey", 'disobey')] LibM19070701-V02-03-page1.txt: [("'a", 'a')] LibM19070701-V02-03-page13.txt: [("'Demand", 'Demand')] LibM19070701-V02-03-page25.txt: [("'to", 'to')] LibM19070701-V02-03-page27.txt: [("'earnest", 'earnest')] LibM19070701-V02-03-page31.txt: [("'uses", 'uses')] LibM19070701-V02-03-page33.txt: [("'one", 'one')] LibM19070701-V02-03-page6.txt: [("'time", 'time')] LibM19071001-V02-04-page11.txt: [("'Bishop", 'Bishop'), ("'press", 'press')] LibM19071001-V02-04-page31.txt: [("'I", 'I')] LibM19071001-V02-04-page50.txt: [("'legislation", 'legislation')] LibM19071001-V02-04-page51.txt: [("'Ier", 'Ier'), ("'pr..", 'pr..'), ("'Isom", 'Isom'), ("'rotor", 'rotor'), ("'.", '.'), ("'en", 'en'), ("'...", '...'), ("'rryn", 'rryn'), ("'hot", 'hot')] LibM19071001-V02-04-page7.txt: [("'because", 'because')] LibM19071001-V02-04-page8.txt: [("'In", 'In')] LibM19080101-V03-01-page22.txt: [("'religionaboveall", 'religionaboveall')] LibM19080101-V03-01-page23.txt: [("'of", 'of'), ("'tween", 'tween')] LibM19080101-V03-01-page42.txt: [("'the", 'the')] LibM19080401-V03-02-page1.txt: [('\'".', '".')] LibM19080401-V03-02-page18.txt: [("'A", 'A')] LibM19080401-V03-02-page31.txt: [("'or", 'or'), ("'from", 'from')] LibM19080401-V03-02-page33.txt: [("'Tis", 'Tis')] LibM19080701-V03-03-page11.txt: [("'ago", 'ago')] LibM19080701-V03-03-page43.txt: [("'Sunday", 'Sunday')] LibM19080701-V03-03-page52.txt: [("'um", 'um')] LibM19081001-V03-04-page33.txt: [("'we", 'we')] LibM19090101-V04-01-page15.txt: [("'honor", 'honor')] LibM19090101-V04-01-page44.txt: [("'s", 's')] LibM19090101-V04-01-page49.txt: [("'A", 'A')] LibM19090401-V04-02-page1.txt: [("''.", '.')] LibM19090401-V04-02-page2.txt: [("'.", '.'), ("'r", 'r')] LibM19090401-V04-02-page20.txt: [("'religious", 'religious')] LibM19090401-V04-02-page24.txt: [("'the", 'the')] LibM19090401-V04-02-page32.txt: [("'a", 'a')] LibM19090401-V04-02-page45.txt: [("'fio", 'fio'), ("'.", '.')] LibM19090401-V04-02-page48.txt: [("'UNTIL", 'UNTIL')] LibM19090401-V04-02-page49.txt: [("'A", 'A')] LibM19090401-V04-02-page51.txt: [("'ARIZ", 'ARIZ')] LibM19090701-V04-03-page1.txt: [("'''....", '....')] LibM19090701-V04-03-page38.txt: [("'moment", 'moment')] LibM19090701-V04-03-page43.txt: [("'for", 'for')] LibM19090701-V04-03-page44.txt: [("'WET'andtRIY", 'WETandtRIY')] LibM19090701-V04-03-page45.txt: [("'GIP", 'GIP')] LibM19090701-V04-03-page49.txt: [("'iples", 'iples')] LibM19091001-V04-04-page14.txt: [("'background.", 'background.')] LibM19091001-V04-04-page39.txt: [("'instance", 'instance')] LibM19091001-V04-04-page40.txt: [("'our", 'our')] LibM19091001-V04-04-page46.txt: [("'much", 'much')] LibM19091001-V04-04-page5.txt: [("'of", 'of')] LibM19100101-V05-01-page10.txt: [("'its", 'its')] LibM19100101-V05-01-page11.txt: [("'of", 'of'), ("'been", 'been')] LibM19100101-V05-01-page12.txt: [("'now", 'now')] LibM19100101-V05-01-page17.txt: [("'of", 'of')] LibM19100101-V05-01-page19.txt: [("'siderable", 'siderable')] LibM19100101-V05-01-page20.txt: [("'day", 'day')] LibM19100101-V05-01-page25.txt: [("'why", 'why')] LibM19100101-V05-01-page31.txt: [("'AMOR", 'AMOR')] LibM19100101-V05-01-page34.txt: [("'Such", 'Such')] LibM19100101-V05-01-page39.txt: [("'profound", 'profound')] LibM19100401-V05-02-page14.txt: [("'for", 'for')] LibM19100401-V05-02-page24.txt: [("'s", 's')] LibM19100401-V05-02-page27.txt: [("'resident", 'resident')] LibM19100401-V05-02-page28.txt: [("'from", 'from')] LibM19100401-V05-02-page32.txt: [("'for", 'for')] LibM19100401-V05-02-page7.txt: [("'doubt", 'doubt')] LibM19100701-V05-03-page1.txt: [("'PIN", 'PIN')] LibM19100701-V05-03-page44.txt: [("'together", 'together')] LibM19100701-V05-03-page9.txt: [("'demanding", 'demanding')] LibM19101001-V05-04-page10.txt: [("'direct", 'direct')] LibM19101001-V05-04-page34.txt: [("'of", 'of'), ("'replies", 'replies')] LibM19101001-V05-04-page36.txt: [("'Amore", 'Amore')] LibM19101001-V05-04-page49.txt: [("'MAGAZINE", 'MAGAZINE')] LibM19110101-V06-01-page10.txt: [("'wants", 'wants')] LibM19110101-V06-01-page15.txt: [("'demanding", 'demanding')] LibM19110101-V06-01-page17.txt: [("'shall", 'shall')] LibM19110101-V06-01-page34.txt: [("'debates", 'debates')] LibM19110101-V06-01-page35.txt: [("'with", 'with')] LibM19110101-V06-01-page37.txt: [("'uniting", 'uniting')] LibM19110101-V06-01-page39.txt: [("'as", 'as')] LibM19110101-V06-01-page42.txt: [("'Adventists", 'Adventists')] LibM19110101-V06-01-page45.txt: [("'neath", 'neath')] LibM19110101-V06-01-page49.txt: [("'St", 'St')] LibM19110101-V06-01-page5.txt: [("'enforce", 'enforce')] LibM19110101-V06-01-page9.txt: [("'by", 'by')] LibM19110401-V06-02-page1.txt: [("'apple", 'apple'), ("'ftIfl", 'ftIfl')] LibM19110401-V06-02-page26.txt: [("'science", 'science'), ("'of", 'of')] LibM19110701-V06-03-page2.txt: [("'liberty", 'liberty')] LibM19110701-V06-03-page21.txt: [("'.", '.')] LibM19110701-V06-03-page25.txt: [("'eagle", 'eagle')] LibM19110701-V06-03-page29.txt: [("'of", 'of')] LibM19110701-V06-03-page30.txt: [("'goo", 'goo')] LibM19110701-V06-03-page4.txt: [("'painaas", 'painaas')] LibM19110701-V06-03-page42.txt: [("'and", 'and')] LibM19110701-V06-03-page46.txt: [("'the", 'the')] LibM19111001-V06-04-page17.txt: [('\'"', '"')] LibM19111001-V06-04-page19.txt: [("'positively", 'positively'), ("'V", 'V')] LibM19111001-V06-04-page20.txt: [("'the", 'the')] LibM19111001-V06-04-page30.txt: [("'liberties", 'liberties')] LibM19111001-V06-04-page41.txt: [("'to", 'to')] LibM19111001-V06-04-page52.txt: [("'Writings", 'Writings')] LibM19120101-V07-01-page10.txt: [("'with", 'with'), ("'hardly", 'hardly')] LibM19120101-V07-01-page22.txt: [("'and", 'and')] LibM19120101-V07-01-page26.txt: [("'child", 'child')] LibM19120101-V07-01-page27.txt: [("'RESIDENT", 'RESIDENT')] LibM19120101-V07-01-page29.txt: [("'if", 'if')] LibM19120101-V07-01-page30.txt: [("'s", 's')] LibM19120101-V07-01-page31.txt: [("'be", 'be')] LibM19120401-V07-02-page28.txt: [("'rections", 'rections'), ("'effect", 'effect')] LibM19120401-V07-02-page32.txt: [("'Catholics", 'Catholics')] LibM19120401-V07-02-page35.txt: [("'of", 'of'), ("'be", 'be')] LibM19120701-V07-03-page4.txt: [("'.", '.'), ("'ma", 'ma'), ("'.Z....", '.Z....'), ("'..", '..'), ("'gut.", 'gut.'), ("'.", '.')] LibM19120701-V07-03-page40.txt: [("'with", 'with')] LibM19120701-V07-03-page42.txt: [("'of", 'of')] LibM19120701-V07-03-page51.txt: [("'GAZINE", 'GAZINE')] LibM19120701-V07-03-page52.txt: [("'Ne", 'Ne')] LibM19120701-V07-03-page8.txt: [("'twixt", 'twixt')] LibM19121001-V07-04-page26.txt: [("'tat", 'tat')] LibM19121001-V07-04-page6.txt: [("'.", '.'), ("'aroe", 'aroe'), ("'ammo", 'ammo'), ("'Meow", 'Meow')] LibM19130101-V08-01-page15.txt: [("'I-JAMES", 'I-JAMES')] LibM19130101-V08-01-page2.txt: [("'Religious", 'Religious')] LibM19130101-V08-01-page22.txt: [("'the", 'the')] LibM19130101-V08-01-page23.txt: [("'religious", 'religious')] LibM19130101-V08-01-page31.txt: [("'avoid", 'avoid')] LibM19130101-V08-01-page42.txt: [("'en.", 'en.'), ("'ode", 'ode')] LibM19130101-V08-01-page5.txt: [("'White", 'White')] LibM19130401-V08-02-page13.txt: [("'provided", 'provided')] LibM19130401-V08-02-page31.txt: [("'of", 'of')] LibM19130401-V08-02-page32.txt: [("'to", 'to')] LibM19130401-V08-02-page34.txt: [("'let", 'let')] LibM19130401-V08-02-page38.txt: [("'Tis", 'Tis')] LibM19130401-V08-02-page49.txt: [("'Society", 'Society')] LibM19130701-V08-03-page27.txt: [("'C.'''.", 'C..'), ("'.....", '.....')] LibM19130701-V08-03-page50.txt: [("'wishing", 'wishing')] LibM19130701-V08-03-page51.txt: [("'WASH", 'WASH')] LibM19131001-V08-04-page12.txt: [("'first", 'first')] LibM19131001-V08-04-page13.txt: [("'of", 'of')] LibM19131001-V08-04-page25.txt: [("'ay", 'ay')] LibM19131001-V08-04-page3.txt: [("'This", 'This'), ("'.", '.')] LibM19131001-V08-04-page4.txt: [("'OVID.", 'OVID.')] LibM19131001-V08-04-page41.txt: [("'so", 'so')] LibM19131001-V08-04-page5.txt: [("'on", 'on')] LibM19131001-V08-04-page52.txt: [("'.", '.')] LibM19140101-V09-01-page14.txt: [("'give", 'give')] LibM19140101-V09-01-page15.txt: [("'just", 'just')] LibM19140101-V09-01-page23.txt: [("'and", 'and')] LibM19140101-V09-01-page31.txt: [("'i'i", 'ii'), ("'I.", 'I.'), ("'..i", '..i')] LibM19140101-V09-01-page42.txt: [("'God", 'God')] LibM19140101-V09-01-page43.txt: [("'once", 'once')] LibM19140101-V09-01-page56.txt: [("'VA", 'VA'), ("'Nit", 'Nit')] LibM19140401-V09-02-page15.txt: [("'contrary", 'contrary')] LibM19140401-V09-02-page20.txt: [("'state", 'state')] LibM19140401-V09-02-page23.txt: [("'shave", 'shave')] LibM19140401-V09-02-page4.txt: [("''t", 't')] LibM19140701-V09-03-page11.txt: [("'tis", 'tis')] LibM19140701-V09-03-page18.txt: [("'prohibit", 'prohibit')] LibM19140701-V09-03-page19.txt: [("'The", 'The')] LibM19140701-V09-03-page26.txt: [("'orris", 'orris')] LibM19140701-V09-03-page28.txt: [("'the", 'the')] LibM19140701-V09-03-page29.txt: [("'as", 'as')] LibM19140701-V09-03-page36.txt: [("'riot", 'riot')] LibM19140701-V09-03-page37.txt: [("'The", 'The')] LibM19140701-V09-03-page4.txt: [("'UT", 'UT'), ("'esired.", 'esired.')] LibM19140701-V09-03-page40.txt: [("'elected", 'elected')] LibM19140701-V09-03-page45.txt: [("'in", 'in')] LibM19140701-V09-03-page49.txt: [("'t", 't'), ("'t", 't'), ('\'\'"', '"'), ("'TX", 'TX')] LibM19140701-V09-03-page51.txt: [("'mon", 'mon')] LibM19141001-V09-04-page11.txt: [("'a", 'a')] LibM19141001-V09-04-page13.txt: [("'now", 'now')] LibM19141001-V09-04-page24.txt: [("'IiE", 'IiE')] LibM19141001-V09-04-page32.txt: [("'hung", 'hung')] LibM19141001-V09-04-page35.txt: [("'IV", 'IV')] LibM19141001-V09-04-page40.txt: [("'enjoy", 'enjoy')] LibM19141001-V09-04-page45.txt: [("'the", 'the')] LibM19141001-V09-04-page46.txt: [("'act", 'act')] LibM19141001-V09-04-page50.txt: [("'M", 'M'), ("'N", 'N'), ("'C", 'C')] LibM19141001-V09-04-page51.txt: [("'.", '.')] LibM19141001-V09-04-page52.txt: [("'AK.", 'AK.'), ("'CY", 'CY')] LibM19141001-V09-04-page9.txt: [("'at", 'at')] LibM19150101-V10-01-page11.txt: [("'Liberty", 'Liberty')] LibM19150101-V10-01-page14.txt: [("'a", 'a')] LibM19150101-V10-01-page21.txt: [("'thus", 'thus')] LibM19150101-V10-01-page24.txt: [("'the", 'the'), ("'to", 'to')] LibM19150101-V10-01-page34.txt: [("'Tis", 'Tis')] LibM19150101-V10-01-page38.txt: [("'fallacy", 'fallacy')] LibM19150101-V10-01-page48.txt: [("'thereby", 'thereby')] LibM19150101-V10-01-page52.txt: [("'M", 'M')] LibM19150101-V10-01-page53.txt: [("'comet", 'comet'), ("'Protestant", 'Protestant')] LibM19150401-V10-02-page19.txt: [("'directed", 'directed')] LibM19150401-V10-02-page22.txt: [("'recourse", 'recourse')] LibM19150401-V10-02-page36.txt: [("'s", 's')] LibM19150401-V10-02-page39.txt: [("'Upon", 'Upon')] LibM19150401-V10-02-page46.txt: [("'O", 'O')] LibM19150401-V10-02-page6.txt: [("'IMN", 'IMN'), ('\'"Ar', '"Ar')] LibM19150701-V10-03-page12.txt: [("'at", 'at')] LibM19150701-V10-03-page15.txt: [("'twixti", 'twixti')] LibM19150701-V10-03-page20.txt: [("'citizens", 'citizens')] LibM19150701-V10-03-page26.txt: [("'Ipon", 'Ipon')] LibM19150701-V10-03-page33.txt: [("'defend", 'defend')] LibM19150701-V10-03-page42.txt: [("'a", 'a')] LibM19150701-V10-03-page43.txt: [("'in", 'in')] LibM19151001-V10-04-page11.txt: [("'publish", 'publish')] LibM19151001-V10-04-page20.txt: [("'Part", 'Part')] LibM19151001-V10-04-page21.txt: [("'personal", 'personal')] LibM19151001-V10-04-page22.txt: [("'duty", 'duty')] LibM19151001-V10-04-page25.txt: [("'buries", 'buries')] LibM19151001-V10-04-page28.txt: [("'Twixt", 'Twixt')] LibM19151001-V10-04-page47.txt: [("'immutable", 'immutable')] LibM19151001-V10-04-page51.txt: [("'Vs", 'Vs')] LibM19160101-V11-01-page11.txt: [("'union", 'union')] LibM19160101-V11-01-page13.txt: [("'venerable", 'venerable')] LibM19160101-V11-01-page44.txt: [("'....", '....'), ('\'\'.....".', '.....".'), ("'JAC'V", 'JACV'), ("'i", 'i'), ("'.", '.'), ("''.", '.'), ("'ti", 'ti'), ("'t", 't'), ("'sr", 'sr'), ("'Ae.", 'Ae.')] LibM19160101-V11-01-page48.txt: [("'members", 'members')] LibM19160101-V11-01e-page16.txt: [("'Washington", 'Washington')] LibM19160101-V11-01e-page7.txt: [("'The", 'The')] LibM19160401-V11-02-page10.txt: [("'a", 'a'), ("'Na", 'Na')] LibM19160401-V11-02-page12.txt: [("'as", 'as')] LibM19160401-V11-02-page16.txt: [("'if", 'if')] LibM19160401-V11-02-page20.txt: [("'Company", 'Company')] LibM19160401-V11-02-page26.txt: [("'It", 'It')] LibM19160401-V11-02-page31.txt: [("'tis", 'tis'), ("'I", 'I')] LibM19160401-V11-02-page46.txt: [("'Traitors", 'Traitors'), ('\'"', '"')] LibM19160701-V11-03-page23.txt: [("'IM", 'IM')] LibM19160701-V11-03-page27.txt: [("'An", 'An')] LibM19160701-V11-03-page42.txt: [('\'"', '"')] LibM19160701-V11-03-page6.txt: [("'neath", 'neath')] LibM19161001-V11-04-page10.txt: [("'a", 'a')] LibM19161001-V11-04-page19.txt: [("'rest", 'rest')] LibM19161001-V11-04-page20.txt: [("'Illinois", 'Illinois')] LibM19161001-V11-04-page35.txt: [("'regarding", 'regarding')] LibM19161001-V11-04-page36.txt: [("'sent", 'sent')] LibM19161001-V11-04-page37.txt: [("'the", 'the')] LibM19161001-V11-04-page39.txt: [("'of", 'of')] LibM19161001-V11-04-page41.txt: [("'court", 'court')] LibM19170101-V12-01-page26.txt: [("'nternational", 'nternational')] LibM19170101-V12-01-page27.txt: [("'Duprey's", 'Dupreys'), ("'Moore", 'Moore')] LibM19170101-V12-01-page30.txt: [("'banishing", 'banishing')] LibM19170101-V12-01-page35.txt: [("'ts", 'ts')] LibM19170101-V12-01-page6.txt: [("'servile", 'servile')] LibM19170401-V12-02-page16.txt: [("'no", 'no')] LibM19170401-V12-02-page19.txt: [("'If", 'If')] LibM19170401-V12-02-page20.txt: [("'Twas", 'Twas')] LibM19170401-V12-02-page22.txt: [("'Tis", 'Tis')] LibM19170401-V12-02-page25.txt: [("'Tis", 'Tis')] LibM19170401-V12-02-page29.txt: [('\'"', '"')] LibM19170401-V12-02-page5.txt: [("'o", 'o')] LibM19170401-V12-02-page9.txt: [("'that", 'that')] LibM19170701-V12-03-page1.txt: [("'al", 'al')] LibM19170701-V12-03-page10.txt: [("'THE", 'THE')] LibM19170701-V12-03-page12.txt: [('\'s"', 's"')] LibM19170701-V12-03-page17.txt: [("'State", 'State')] LibM19170701-V12-03-page29.txt: [("'the", 'the')] LibM19171001-V12-04-page1.txt: [("'ublished", 'ublished')] LibM19171001-V12-04-page16.txt: [("'mounted", 'mounted')] LibM19171001-V12-04-page18.txt: [("'Luther", 'Luther'), ("'tboot", 'tboot')] LibM19171001-V12-04-page27.txt: [("'us", 'us')] LibM19171001-V12-04-page9.txt: [("'coordination", 'coordination'), ("'most", 'most')] LibM19180101-V13-01-page4.txt: [('\'"E', '"E'), ("'attr", 'attr')] LibM19180401-V13-02-page14.txt: [("'however", 'however')] LibM19180401-V13-02-page22.txt: [("'of", 'of')] LibM19180401-V13-02-page31.txt: [("'fields", 'fields')] LibM19180401-V13-02-page36.txt: [("'THE", 'THE')] LibM19180701-V13-03-page10.txt: [("'of", 'of')] LibM19180701-V13-03-page21.txt: [("'no", 'no')] LibM19180701-V13-03-page32.txt: [("'years", 'years')] LibM19180701-V13-03-page34.txt: [("'what", 'what'), ("'Tis", 'Tis')] LibM19181001-V13-04-page15.txt: [("'being", 'being')] LibM19181001-V13-04-page19.txt: [("'EMOCRACY", 'EMOCRACY')] LibM19181001-V13-04-page21.txt: [("'the", 'the')] LibM19181001-V13-04-page25.txt: [("'virtually", 'virtually')] LibM19190101-V15-01-page18.txt: [("'enforce", 'enforce')] LibM19190101-V15-01-page19.txt: [("'remain", 'remain')] LibM19190101-V15-01-page20.txt: [("'Oxtails", 'Oxtails')] LibM19190101-V15-01-page22.txt: [("'Sunday", 'Sunday')] LibM19190401-V15-02-page1.txt: [("'W", 'W')] LibM19190401-V15-02-page14.txt: [("'the", 'the')] LibM19190401-V15-02-page15.txt: [("'the", 'the'), ("'twixt", 'twixt')] LibM19190401-V15-02-page16.txt: [("'the", 'the')] LibM19190401-V15-02-page21.txt: [("'a", 'a')] LibM19190401-V15-02-page22.txt: [("'million", 'million')] LibM19190401-V15-02-page5.txt: [("'LE", 'LE'), ("'being", 'being')] LibM19190401-V15-02-page6.txt: [("'According", 'According')] LibM19190701-V15-03-page20.txt: [("'Presbyterian", 'Presbyterian')] LibM19190701-V15-03-page21.txt: [("'or", 'or')] LibM19190701-V15-03-page29.txt: [("'lewd", 'lewd')] LibM19190701-V15-03-page30.txt: [("'v", 'v'), ("'he", 'he')] LibM19190701-V15-03-page31.txt: [("'.", '.'), ("'the", 'the')] LibM19190701-V15-03-page32.txt: [("'United", 'United')] LibM19190701-V15-03-page33.txt: [("'and", 'and')] LibM19191001-V15-04-page15.txt: [('\'"', '"')] LibM19191001-V15-04-page18.txt: [("'five", 'five')] LibM19191001-V15-04-page7.txt: [("'purity", 'purity'), ("'by", 'by')] LibM19200101-V14-01-page1.txt: [("'IN", 'IN')] LibM19200101-V14-01-page6.txt: [("'s", 's')] LibM19200401-V14-02-page25.txt: [("'Volumes", 'Volumes')] LibM19200701-V14-03-page15.txt: [("'racTIMIriiiiitiriiltililietcliteiViiVittiiiitiEVAlifittiA", 'racTIMIriiiiitiriiltililietcliteiViiVittiiiitiEVAlifittiA')] LibM19200701-V14-03-page24.txt: [("'A", 'A')] LibM19200701-V14-03-page27.txt: [("'concerned.", 'concerned.')] LibM19200701-V14-03-page32.txt: [("'fourth", 'fourth')] LibM19200701-V14-03-page33.txt: [("'the", 'the')] LibM19200701-V14-03-page4.txt: [("'Twixt", 'Twixt')] LibM19201001-V14-04-page15.txt: [("'m", 'm')] LibM19201001-V14-04-page16.txt: [("'mannum", 'mannum')] LibM19201001-V14-04-page23.txt: [("'Signs", 'Signs'), ("'Signs", 'Signs'), ("'Cut", 'Cut'), ("'Signs", 'Signs')] LibM19201001-V14-04-page25.txt: [("'were", 'were')] LibM19201001-V14-04-page29.txt: [("'praise", 'praise'), ("'for", 'for')] LibM19201001-V14-04-page7.txt: [("'oppression.", 'oppression.')]
In [23]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/LibM/correction4 Average verified rate: 0.9811434974335735 Average of error rates: 0.03407373440939106 Total token count: 1452019
In [24]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[24]:
[("'", 1499), ('m', 1341), ('d', 1257), ('e', 1013), ('w', 957), ('t', 847), ('n', 787), ('r', 687), ('f', 634), ('g', 386), ('x', 271), ('u', 209), ('k', 192), ('tv', 150), ('th', 122), ('pa', 104), ('sunday-law', 92), ('re', 89), ('z', 83), ('ex', 77), ('co', 75), ('io', 72), ('id', 71), ('mo', 63), ('postmaster-general', 62), ('ga', 58), ('post-offices', 57), ('un', 57), ('un-american', 57), ('va', 57), ('statute-books', 56), ('sunday-closing', 54), ('church-and-state', 49), ('tion', 45), ('mm', 45), ('q', 44), ('li', 43), ('mt', 42), ('attorney-general', 41), ('sunday-rest', 39), ('wm', 38), ('ro', 38), ('pp', 38), ('mi', 37), ('charta', 37), ('mc', 33), ('al', 32), ('ri', 31), ('neander', 31), ('-', 30)]
Correction 5 -- Rejoin Burst Words¶
In [25]:
# %load shared_elements/rejoin_burst_words.py
prev = cycle
cycle = "correction5"
directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
os.makedirs(directories['cycle'])
corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))
for filename in corpus:
content = utilities.readfile(directories['prev'], filename)
pattern = re.compile("(\s(\w{1,2}\s){5,})")
replacements = []
clean.check_splits(pattern, spelling_dictionary, content, replacements)
if len(replacements) > 0:
print('{}: {}'.format(filename, replacements))
for replacement in replacements:
content = clean.replace_pair(replacement, content)
else:
pass
with open(join(directories['cycle'], filename), mode="w") as o:
o.write(content)
o.close()
LibM19061001-V01-03-page17.txt: [(' r a t h e ', 'rathe')] LibM19061001-V01-03-page24.txt: [('To', 'To')] LibM19100101-V05-01-page22.txt: [('It', 'It')] LibM19100401-V05-02-page52.txt: [('El', 'El')] LibM19100701-V05-03-page19.txt: [(' f or w a r d\n', 'forward')] LibM19120401-V07-02-page46.txt: [('It', 'It')] LibM19121001-V07-04-page29.txt: [('As', 'As')] LibM19121001-V07-04-page5.txt: [('El', 'El')] LibM19150101-V10-01-page11.txt: [('To', 'To')] LibM19150101-V10-01-page4.txt: [('Lo', 'Lo')] LibM19150401-V10-02-page6.txt: [('\nU N U S U A L ', 'UNUSUAL')] LibM19150701-V10-03-page27.txt: [('It', 'It')] LibM19150701-V10-03-page47.txt: [(' m a n is a ', 'manisa')] LibM19170401-V12-02-page5.txt: [(' p r es en t ', 'present')] LibM19170701-V12-03-page16.txt: [('Is', 'Is')] LibM19200101-V14-01-page6.txt: [(' c an n o t ', 'cannot')]
In [26]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/LibM/correction5 Average verified rate: 0.9811555435567139 Average of error rates: 0.0340564930300807 Total token count: 1451992
In [27]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[27]:
[("'", 1499), ('m', 1340), ('d', 1256), ('e', 1012), ('w', 956), ('t', 844), ('n', 784), ('r', 684), ('f', 633), ('g', 386), ('x', 271), ('u', 206), ('k', 192), ('tv', 150), ('th', 122), ('pa', 104), ('sunday-law', 92), ('re', 89), ('z', 83), ('ex', 77), ('co', 75), ('io', 72), ('id', 71), ('mo', 63), ('postmaster-general', 62), ('ga', 58), ('post-offices', 57), ('un', 57), ('un-american', 57), ('va', 57), ('statute-books', 56), ('sunday-closing', 54), ('church-and-state', 49), ('tion', 45), ('mm', 45), ('q', 44), ('li', 43), ('mt', 42), ('attorney-general', 41), ('sunday-rest', 39), ('wm', 38), ('ro', 38), ('pp', 38), ('mi', 37), ('charta', 37), ('mc', 33), ('al', 32), ('ri', 31), ('neander', 31), ('-', 30)]
Correction 6 -- Rejoin Split Words¶
In [28]:
# %load shared_elements/rejoin_split_words.py
prev = cycle
cycle = "correction6"
directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
os.makedirs(directories['cycle'])
corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))
for filename in corpus:
content = utilities.readfile(directories['prev'], filename)
text = re.sub(r"[0-9,!?$:;&]", " ", content)
tokens = utilities.tokenize_text(text)
errors = reports.identify_errors(tokens, spelling_dictionary)
replacements = clean.check_if_stem(errors, spelling_dictionary, tokens, get_prior=False)
if len(replacements) > 0:
print('{}: {}'.format(filename, replacements))
for replacement in replacements:
content = clean.replace_split_words(replacement, content)
else:
pass
with open(join(directories['cycle'], filename), mode="w") as o:
o.write(content)
o.close()
LibM19060401-V01-01-page11.txt: [('Mc', 'Alister')] LibM19060401-V01-01-page35.txt: [('ri', 'e'), ('re', 'd'), ('ti', 'c')] LibM19061001-V01-03-page19.txt: [('Sabb', 'at')] LibM19061001-V01-03-page21.txt: [('destruc', 'tion')] LibM19061001-V01-03-page4.txt: [('spir', 'itual')] LibM19061001-V01-03-page6.txt: [('LIBERT', 'Y')] LibM19070101-V02-01-page2.txt: [('ti', 'A')] LibM19070101-V02-01-page23.txt: [('LIBE', 'RTY')] LibM19070101-V02-01-page6.txt: [('impor', 'tance')] LibM19070401-V02-02-page17.txt: [('LAN', 'CASTER')] LibM19070701-V02-03-page14.txt: [('Demi', 'god')] LibM19070701-V02-03-page18.txt: [('unfort', 'unately')] LibM19070701-V02-03-page23.txt: [('Mc', 'Alister')] LibM19071001-V02-04-page12.txt: [('approv', 'e')] LibM19071001-V02-04-page14.txt: [('MC', 'KENNA')] LibM19071001-V02-04-page20.txt: [('controv', 'ersy')] LibM19071001-V02-04-page38.txt: [('un', 'Christian'), ('Fr', 'eedom')] LibM19071001-V02-04-page46.txt: [('co', 'operation')] LibM19071001-V02-04-page48.txt: [('th', 'e')] LibM19071001-V02-04-page50.txt: [('co', 'respondents')] LibM19071001-V02-04-page51.txt: [('ren', 'al'), ('re', 'hob')] LibM19080101-V03-01-page20.txt: [('un', 'Christian')] LibM19080101-V03-01-page32.txt: [('ob', 'serve')] LibM19080101-V03-01-page41.txt: [('self-govern', 'ment')] LibM19080401-V03-02-page1.txt: [('pa', 'I')] LibM19080401-V03-02-page28.txt: [('Legis', 'lation')] LibM19080401-V03-02-page30.txt: [('upo', 'n')] LibM19080701-V03-03-page1.txt: [('ra', 'ff')] LibM19080701-V03-03-page20.txt: [('re', 'A')] LibM19080701-V03-03-page28.txt: [('Sund', 'a')] LibM19080701-V03-03-page37.txt: [('religi', 'o')] LibM19080701-V03-03-page46.txt: [('Northweste', 'r')] LibM19081001-V03-04-page11.txt: [('PHILA', 'DELPHIA'), ('WILLI', 'AM')] LibM19081001-V03-04-page15.txt: [('fi', 'e')] LibM19081001-V03-04-page19.txt: [('TH', 'E'), ('religi', 'o'), ('Por', 'e'), ('su', 'preme')] LibM19090101-V04-01-page28.txt: [('threate', 'n')] LibM19090101-V04-01-page33.txt: [('estab', 'lish')] LibM19090101-V04-01-page45.txt: [('Mc', "Clure's")] LibM19090401-V04-02-page11.txt: [('Bonif', 'ace')] LibM19090401-V04-02-page12.txt: [('Boni', 'face')] LibM19090401-V04-02-page2.txt: [('po', 'i')] LibM19090401-V04-02-page31.txt: [('MC', 'MILLAN')] LibM19090401-V04-02-page45.txt: [('eyo', 't')] LibM19090401-V04-02-page48.txt: [('fr', 'Ee')] LibM19090401-V04-02-page49.txt: [('om', 'a')] LibM19090401-V04-02-page50.txt: [('co', 'operate')] LibM19090401-V04-02-page52.txt: [('ma', "n's")] LibM19090401-V04-02-page8.txt: [('Mc', 'Dermott')] LibM19090401-V04-02-page9.txt: [('co', 'operation')] LibM19090701-V04-03-page10.txt: [('RECEP', 'TION')] LibM19090701-V04-03-page29.txt: [('Speakin', 'g')] LibM19090701-V04-03-page49.txt: [('Appe', 'als')] LibM19090701-V04-03-page7.txt: [('Mc', 'Crory')] LibM19091001-V04-04-page38.txt: [('si', 'n')] LibM19091001-V04-04-page47.txt: [('Appe', 'als')] LibM19091001-V04-04-page7.txt: [('Mc', 'Kinley')] LibM19100101-V05-01-page17.txt: [('th', 'at'), ('Mc', 'Kenna')] LibM19100101-V05-01-page19.txt: [('incon', 'siderable')] LibM19100101-V05-01-page26.txt: [('gua', 'ranteed')] LibM19100101-V05-01-page31.txt: [('SU', 'NDAY')] LibM19100101-V05-01-page39.txt: [('CA', 'Y')] LibM19100401-V05-02-page15.txt: [('uncon', 'fessed')] LibM19100401-V05-02-page23.txt: [('secre', 'tary')] LibM19100401-V05-02-page35.txt: [('PeRsECUTI', 'ON')] LibM19100401-V05-02-page52.txt: [('legisla', 'tor')] LibM19100401-V05-02-page6.txt: [('un', 'Christian')] LibM19100401-V05-02-page8.txt: [('PRESI', 'DENT')] LibM19100701-V05-03-page1.txt: [('wo', 'g'), ('UN', 'I')] LibM19100701-V05-03-page29.txt: [('al', 'ways')] LibM19100701-V05-03-page37.txt: [('HISTOR', 'ICAL'), ('ERRON', 'EOUS')] LibM19100701-V05-03-page45.txt: [('ch', 'ose')] LibM19100701-V05-03-page49.txt: [('PROTES', 'TANT'), ('Re', 'stated')] LibM19100701-V05-03-page7.txt: [('Mc', 'Kinley')] LibM19101001-V05-04-page15.txt: [('co', 'operate')] LibM19101001-V05-04-page28.txt: [('PA', 'L')] LibM19101001-V05-04-page39.txt: [('libert', 'y')] LibM19101001-V05-04-page49.txt: [('PROTES', 'TANT')] LibM19101001-V05-04-page50.txt: [('co', 'operation')] LibM19110101-V06-01-page1.txt: [('nU', 'M')] LibM19110101-V06-01-page12.txt: [('compuls', 'ion')] LibM19110101-V06-01-page18.txt: [('Mc', 'Donald')] LibM19110101-V06-01-page34.txt: [('consid', 'ered'), ('RE', 'LIGION')] LibM19110101-V06-01-page35.txt: [('shep', "herd's")] LibM19110101-V06-01-page48.txt: [('ta', 'king')] LibM19110101-V06-01-page5.txt: [('TI', 'E')] LibM19110401-V06-02-page1.txt: [('mo', 'Jo')] LibM19110401-V06-02-page12.txt: [('ment', 'on')] LibM19110701-V06-03-page14.txt: [('un', 'Christian')] LibM19110701-V06-03-page20.txt: [('WA', 'RTBURG')] LibM19110701-V06-03-page25.txt: [('republi', 'c')] LibM19110701-V06-03-page32.txt: [('religi', 'o')] LibM19110701-V06-03-page33.txt: [('Switzerlan', 'd')] LibM19110701-V06-03-page37.txt: [('religi', 'o')] LibM19110701-V06-03-page38.txt: [('reen', 'forced')] LibM19110701-V06-03-page50.txt: [('expe', 'rience')] LibM19111001-V06-04-page11.txt: [('religi', 'o')] LibM19111001-V06-04-page18.txt: [('Pontif', 'ex')] LibM19111001-V06-04-page38.txt: [('Co', 'n')] LibM19111001-V06-04-page43.txt: [('ma', 'king')] LibM19111001-V06-04-page52.txt: [('ec', 'clesiastical'), ('kl', 'EE'), ('ra', 'm'), ('MI', 'M'), ('LI', 'II'), ('Li', 'N'), ('RI', 'M')] LibM19120101-V07-01-page12.txt: [('certif', 'ying'), ('ern', 'e')] LibM19120101-V07-01-page33.txt: [('Notwithstand', 'ing')] LibM19120101-V07-01-page43.txt: [('ESTAB', 'LISHMENT')] LibM19120101-V07-01-page49.txt: [('FA', 'IN'), ('TA', 'is'), ('SI', 'TA'), ('ci', 'T'), ('devel', 'opment')] LibM19120401-V07-02-page23.txt: [('misrepres', 'entation')] LibM19120401-V07-02-page5.txt: [('M.', '')] LibM19120701-V07-03-page13.txt: [('hol', 'iday')] LibM19120701-V07-03-page2.txt: [('Co', 'ercion')] LibM19120701-V07-03-page4.txt: [('CO', 'NG'), ('gi', 'e')] LibM19120701-V07-03-page5.txt: [('M.', '')] LibM19120701-V07-03-page52.txt: [('M.', '')] LibM19121001-V07-04-page19.txt: [('mul', 'titude'), ('proclama', 'tion')] LibM19121001-V07-04-page4.txt: [('gl', 'O'), ('ma', 'm'), ('MI', 'M')] LibM19121001-V07-04-page49.txt: [('gOR', 'E'), ('M.', '')] LibM19121001-V07-04-page6.txt: [('mo', 'i'), ('G.', '')] LibM19130101-V08-01-page1.txt: [('WA', 'tTS')] LibM19130101-V08-01-page2.txt: [('M.', ''), ('Ni', 'M')] LibM19130101-V08-01-page6.txt: [('LI', 'BERTY')] LibM19130401-V08-02-page1.txt: [('Lil', 'A')] LibM19130401-V08-02-page15.txt: [('re', 'pealed')] LibM19130401-V08-02-page24.txt: [('impor', 'tance')] LibM19130401-V08-02-page3.txt: [('CHOOS', 'ING')] LibM19130401-V08-02-page33.txt: [('STURDEVA', 'NT')] LibM19130401-V08-02-page34.txt: [('cer', 'O')] LibM19130401-V08-02-page4.txt: [('po', 'O')] LibM19130401-V08-02-page51.txt: [('denounci', 'ng'), ('JUSTI', 'FIES'), ('re', 't')] LibM19130401-V08-02-page7.txt: [('M.', '')] LibM19130701-V08-03-page2.txt: [('ti', 'e')] LibM19130701-V08-03-page3.txt: [('PRIN', 'CIPLES'), ('GREA', 'T'), ('MI', 'M'), ('MA', 'M')] LibM19130701-V08-03-page4.txt: [('XL', 'v')] LibM19130701-V08-03-page41.txt: [('re', 'I')] LibM19130701-V08-03-page44.txt: [('ce', 'e')] LibM19130701-V08-03-page48.txt: [('unlawf', 'ul')] LibM19130701-V08-03-page49.txt: [('eX', 't'), ('ADVER', 'TISED')] LibM19130701-V08-03-page51.txt: [('AL', 'MA')] LibM19131001-V08-04-page12.txt: [('yo', 'ng')] LibM19131001-V08-04-page27.txt: [('EXI', 'LE')] LibM19131001-V08-04-page28.txt: [('troub', 'ler')] LibM19131001-V08-04-page4.txt: [('ro', 'o')] LibM19131001-V08-04-page41.txt: [('ecclesi', 'astical')] LibM19131001-V08-04-page49.txt: [('ADVER', 'TISED')] LibM19131001-V08-04-page50.txt: [('re', 'No')] LibM19131001-V08-04-page52.txt: [('Ak', 'A')] LibM19131001-V08-04-page7.txt: [('M.', '')] LibM19140101-V09-01-page31.txt: [('mo', 'I')] LibM19140101-V09-01-page33.txt: [('RE', 'A')] LibM19140101-V09-01-page54.txt: [('ADVER', 'TISED')] LibM19140101-V09-01-page55.txt: [('EA', 'T'), ('CO', 'PY')] LibM19140101-V09-01-page56.txt: [('relig', 'ion'), ('Ti', 'e')] LibM19140401-V09-02-page11.txt: [('corporatio', 'n'), ('re', 'formation'), ('Congregatio', 'n')] LibM19140401-V09-02-page12.txt: [('un', 'der'), ('coun', 'try'), ('combina', 'tion'), ('al', 'I')] LibM19140401-V09-02-page13.txt: [('ti', 'nes')] LibM19140401-V09-02-page18.txt: [('ASSEM', 'BLY')] LibM19140401-V09-02-page25.txt: [('citi', 'zens')] LibM19140401-V09-02-page3.txt: [('CIRCULAT', 'ING')] LibM19140401-V09-02-page35.txt: [('PROTES', 'TANT')] LibM19140401-V09-02-page4.txt: [('Ki', 'Ng')] LibM19140401-V09-02-page41.txt: [('MAGA', 'ZINE')] LibM19140401-V09-02-page50.txt: [('M.', ''), ('ADVER', 'TISED')] LibM19140401-V09-02-page52.txt: [('re', 'ligious')] LibM19140401-V09-02-page7.txt: [('M.', '')] LibM19140701-V09-03-page2.txt: [('M.', '')] LibM19140701-V09-03-page20.txt: [('MC', 'ADOO'), ('FR', 'T')] LibM19140701-V09-03-page21.txt: [('MC', 'ADOO')] LibM19140701-V09-03-page34.txt: [('ambi', 'tion')] LibM19140701-V09-03-page4.txt: [('indi', 'tes')] LibM19140701-V09-03-page51.txt: [('ti', 'The')] LibM19140701-V09-03-page9.txt: [('sp', 'oken')] LibM19141001-V09-04-page1.txt: [('Sp', 'A')] LibM19141001-V09-04-page26.txt: [('TI', 'The')] LibM19141001-V09-04-page27.txt: [('Al', 'ES')] LibM19141001-V09-04-page3.txt: [('nI', 'M')] LibM19141001-V09-04-page31.txt: [('un', 'fearing'), ('AMERI', 'CANS')] LibM19141001-V09-04-page38.txt: [('es', 'tablish')] LibM19141001-V09-04-page4.txt: [('ro', 'O'), ('Ki', 'M')] LibM19141001-V09-04-page48.txt: [('Magaz', 'ine')] LibM19141001-V09-04-page49.txt: [('MI', 'r')] LibM19141001-V09-04-page50.txt: [('Mit', 'T'), ('li', 'M'), ('tE', 'E')] LibM19141001-V09-04-page51.txt: [('monarchi', 'cal')] LibM19141001-V09-04-page52.txt: [('Al', 'I')] LibM19141001-V09-04-page7.txt: [('M.', '')] LibM19150101-V10-01-page2.txt: [('pre', 'vent')] LibM19150101-V10-01-page3.txt: [('SUBSCRIP', 'TIONS')] LibM19150101-V10-01-page4.txt: [('Ki', 'M')] LibM19150101-V10-01-page51.txt: [('Ti', 'E')] LibM19150101-V10-01-page52.txt: [('MO', 'M'), ('Mi', 'M')] LibM19150101-V10-01-page53.txt: [('STIN', 'G')] LibM19150401-V10-02-page21.txt: [('RE', 'LIGIOUS')] LibM19150401-V10-02-page28.txt: [('impor', 'tant')] LibM19150401-V10-02-page3.txt: [('MI', 'M'), ('YA', 'M')] LibM19150401-V10-02-page40.txt: [('op', 'ening')] LibM19150401-V10-02-page42.txt: [('underg', 'o')] LibM19150401-V10-02-page46.txt: [('Re', 'formation')] LibM19150401-V10-02-page48.txt: [('sa', 'o')] LibM19150401-V10-02-page50.txt: [('M.', '')] LibM19150401-V10-02-page6.txt: [('ASSEMB', 'LE')] LibM19150701-V10-03-page11.txt: [('expec', 'tation')] LibM19150701-V10-03-page2.txt: [('Connecti', 'cut'), ('M.', '')] LibM19150701-V10-03-page28.txt: [('violenc', 'e')] LibM19150701-V10-03-page3.txt: [('Ki', 'M')] LibM19150701-V10-03-page35.txt: [('lif', 'e')] LibM19150701-V10-03-page38.txt: [('withou', 't')] LibM19150701-V10-03-page42.txt: [('M.', '')] LibM19150701-V10-03-page47.txt: [('po', 'se')] LibM19150701-V10-03-page48.txt: [('twenty-f', 'our')] LibM19151001-V10-04-page13.txt: [('politi', 'c')] LibM19151001-V10-04-page2.txt: [('af', 'filiated')] LibM19151001-V10-04-page22.txt: [('peo', 'ple')] LibM19151001-V10-04-page23.txt: [('destruc', 'tion')] LibM19151001-V10-04-page31.txt: [('Un', 'ion')] LibM19151001-V10-04-page48.txt: [('rO', 'O'), ('RE', 'C'), ('Ama', 'rillo')] LibM19151001-V10-04-page49.txt: [('RI', 'M')] LibM19151001-V10-04-page50.txt: [('Ri', 'M'), ('EM', 'F')] LibM19160101-V11-01-page12.txt: [('re', 'fused')] LibM19160101-V11-01-page26.txt: [('se', 'an')] LibM19160101-V11-01-page4.txt: [('M.', '')] LibM19160101-V11-01e-page1.txt: [('mi', 'A')] LibM19160101-V11-01e-page16.txt: [('ss', 'H')] LibM19160401-V11-02-page10.txt: [('OBSERV', 'ANCE')] LibM19160401-V11-02-page22.txt: [('re', 'ligious')] LibM19160401-V11-02-page26.txt: [('Mi', 'n')] LibM19160401-V11-02-page3.txt: [('MO', 'no')] LibM19160401-V11-02-page38.txt: [('Pontif', 'ex')] LibM19160401-V11-02-page4.txt: [('vis', 'ion'), ('teac', 'her')] LibM19160701-V11-03-page21.txt: [('diplom', 'a')] LibM19161001-V11-04-page29.txt: [('LI', 'BER')] LibM19161001-V11-04-page40.txt: [('LIBERT', 'Y')] LibM19170101-V12-01-page16.txt: [('pa', 'tient')] LibM19170101-V12-01-page2.txt: [('af', 'filiated')] LibM19170101-V12-01-page35.txt: [('AMMUN', 'ITION')] LibM19170101-V12-01-page5.txt: [('re', 'garded')] LibM19170701-V12-03-page1.txt: [('Lil', 'A')] LibM19170701-V12-03-page14.txt: [('un', 'Christian')] LibM19171001-V12-04-page10.txt: [('POR', 'TION')] LibM19171001-V12-04-page11.txt: [('suav', 'ity')] LibM19171001-V12-04-page23.txt: [('re', 'forming')] LibM19180101-V13-01-page24.txt: [('temperanc', 'e')] LibM19180101-V13-01-page35.txt: [('Th', 'e')] LibM19180401-V13-02-page31.txt: [('se', 'a')] LibM19180701-V13-03-page13.txt: [('FR', 'A')] LibM19181001-V13-04-page14.txt: [('LI', 'BER')] LibM19181001-V13-04-page32.txt: [('LIBERT', 'Y')] LibM19190101-V15-01-page14.txt: [('CONFESSIO', 'N')] LibM19190101-V15-01-page20.txt: [('pa', 'per')] LibM19190101-V15-01-page21.txt: [('prin', 'ciple')] LibM19190101-V15-01-page22.txt: [('un', 'Christian')] LibM19190101-V15-01-page5.txt: [('COUN', 'TRY')] LibM19190401-V15-02-page17.txt: [('RE', 'LIGIOUS')] LibM19190401-V15-02-page22.txt: [('Ca', 'sar'), ('sar', 'the')] LibM19190401-V15-02-page5.txt: [('vA', 'LE')] LibM19190701-V15-03-page2.txt: [('affil', 'iated')] LibM19190701-V15-03-page28.txt: [('peo', 'ple')] LibM19190701-V15-03-page32.txt: [('reli', 'gion')] LibM19190701-V15-03-page9.txt: [('religi', 'ous')] LibM19191001-V15-04-page1.txt: [('Lil', 'A')] LibM19191001-V15-04-page22.txt: [('Ma', 'Ma')] LibM19191001-V15-04-page23.txt: [('MI', 'NI')] LibM19191001-V15-04-page25.txt: [('Ma', 'Ms')] LibM19200101-V14-01-page2.txt: [('M.', ''), ('enfor', 'ce')] LibM19200401-V14-02-page13.txt: [('co', 'operation')] LibM19200401-V14-02-page14.txt: [('Sund', 'a'), ('co', 'operation')] LibM19200401-V14-02-page23.txt: [('se', 'a')] LibM19200401-V14-02-page35.txt: [('TI', 'THE')] LibM19200401-V14-02-page6.txt: [('LIBERT', 'Y')] LibM19200701-V14-03-page14.txt: [('AL', 'L')] LibM19200701-V14-03-page15.txt: [('M.', '')] LibM19200701-V14-03-page3.txt: [('Tir', 'A')] LibM19200701-V14-03-page8.txt: [('re', 'enacted')] LibM19201001-V14-04-page23.txt: [('UN', 'Christian'), ('Un', 'Scriptural')] LibM19201001-V14-04-page31.txt: [('wa', 'n')] LibM19201001-V14-04-page32.txt: [('SY', 'St')] LibM19201001-V14-04-page7.txt: [('gl', 'o'), ('Mayflo', 'wer')]
In [29]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/LibM/correction6 Average verified rate: 0.9814180834781028 Average of error rates: 0.03366287600880411 Total token count: 1451734
In [30]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[30]:
[("'", 1499), ('m', 1326), ('d', 1254), ('e', 998), ('w', 956), ('t', 837), ('n', 776), ('r', 682), ('f', 633), ('g', 384), ('x', 271), ('u', 206), ('k', 192), ('tv', 150), ('th', 119), ('pa', 100), ('sunday-law', 92), ('z', 83), ('ex', 75), ('io', 72), ('re', 72), ('id', 71), ('co', 64), ('postmaster-general', 62), ('mo', 58), ('ga', 58), ('post-offices', 57), ('un-american', 57), ('statute-books', 56), ('va', 56), ('sunday-closing', 54), ('church-and-state', 49), ('un', 46), ('mm', 46), ('q', 44), ('mt', 42), ('attorney-general', 41), ('tion', 40), ('sunday-rest', 39), ('wm', 38), ('pp', 38), ('charta', 37), ('ro', 36), ('li', 36), ('neander', 31), ('-', 30), ('seventhday', 30), ('mi', 28), ('es', 28), ('ft', 28)]
Correction 7 -- Rejoin Split Words II¶
In [31]:
# %load shared_elements/rejoin_split_words.py
prev = cycle
cycle = "correction7"
directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
os.makedirs(directories['cycle'])
corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))
for filename in corpus:
content = utilities.readfile(directories['prev'], filename)
text = re.sub(r"[0-9,!?$:;&]", " ", content)
tokens = utilities.tokenize_text(text)
errors = reports.identify_errors(tokens, spelling_dictionary)
replacements = clean.check_if_stem(errors, spelling_dictionary, tokens, get_prior=True)
if len(replacements) > 0:
print('{}: {}'.format(filename, replacements))
for replacement in replacements:
content = clean.replace_split_words(replacement, content)
else:
pass
with open(join(directories['cycle'], filename), mode="w") as o:
o.write(content)
o.close()
LibM19060401-V01-01-page11.txt: [('de', 'calogue')] LibM19060401-V01-01-page35.txt: [('en', 'th'), ('d', 'ak'), ('r', 'te')] LibM19060701-V01-02-page12.txt: [('LIB', 'ERTY')] LibM19061001-V01-03-page19.txt: [('m', 'es')] LibM19061001-V01-03-page22.txt: [('bane', 'ful')] LibM19070101-V02-01-page18.txt: [('IN', 'gress')] LibM19070101-V02-01-page21.txt: [('W', 'ILLIMANTIC')] LibM19070101-V02-01-page25.txt: [('r', 'esided')] LibM19070401-V02-02-page31.txt: [('wine', 'bibbers')] LibM19070401-V02-02-page36.txt: [('A', 'STI')] LibM19070701-V02-03-page18.txt: [('WILL', 'IAMS')] LibM19070701-V02-03-page8.txt: [('LIB', 'ERTY')] LibM19071001-V02-04-page15.txt: [('rem', 'arkable'), ('n', 'ation')] LibM19071001-V02-04-page32.txt: [('C', 'hr')] LibM19071001-V02-04-page51.txt: [('for', 'th'), ('r', 'ow')] LibM19080101-V03-01-page26.txt: [('PRES', 'IDENT')] LibM19080101-V03-01-page31.txt: [('Cab', 'inet')] LibM19080401-V03-02-page28.txt: [('Legis', 'lation')] LibM19080401-V03-02-page30.txt: [('p', 'rinciple')] LibM19080701-V03-03-page1.txt: [('ra', 'ff')] LibM19080701-V03-03-page39.txt: [('move', 'ment')] LibM19080701-V03-03-page43.txt: [('uni', 'versal')] LibM19080701-V03-03-page52.txt: [('THE', 'RE'), ('L', 'um')] LibM19081001-V03-04-page1.txt: [('V', 'oiD')] LibM19081001-V03-04-page15.txt: [('con', 'demned')] LibM19081001-V03-04-page38.txt: [('obj', 'ect')] LibM19090101-V04-01-page33.txt: [('estab', 'lish')] LibM19090101-V04-01-page52.txt: [('i', 'll')] LibM19090401-V04-02-page2.txt: [('f', 'ri')] LibM19090401-V04-02-page48.txt: [('fr', 'Ee')] LibM19090701-V04-03-page34.txt: [('CHRIS', 'TIAN')] LibM19090701-V04-03-page49.txt: [('APPEAR', 'ANCE')] LibM19091001-V04-04-page41.txt: [('kin', 'gdom')] LibM19091001-V04-04-page47.txt: [('APPEAR', 'ANCE')] LibM19100101-V05-01-page14.txt: [('WASH', 'INGTON')] LibM19100101-V05-01-page24.txt: [('per', 'se')] LibM19100101-V05-01-page31.txt: [('L', 'OTS')] LibM19100101-V05-01-page39.txt: [('L', 'OS')] LibM19100101-V05-01-page49.txt: [('W', 'ASHINGTON')] LibM19100401-V05-02-page23.txt: [('secre', 'tary')] LibM19100401-V05-02-page5.txt: [('LIB', 'ERTY')] LibM19100401-V05-02-page52.txt: [('C', 'HRISTIANITY'), ('Jan', 'uary')] LibM19100701-V05-03-page32.txt: [('the', 'reof')] LibM19100701-V05-03-page45.txt: [('ch', 'ose')] LibM19100701-V05-03-page49.txt: [('W', 'ASHINGTON')] LibM19101001-V05-04-page24.txt: [('MON', 'TREAL')] LibM19101001-V05-04-page49.txt: [('Romani', 'sm')] LibM19110401-V06-02-page18.txt: [('UNI', 'VERSITY')] LibM19110701-V06-03-page32.txt: [('sent', 'iments'), ('to', 're')] LibM19110701-V06-03-page37.txt: [('Chur', 'ch')] LibM19110701-V06-03-page45.txt: [('the', 're')] LibM19110701-V06-03-page50.txt: [('expe', 'rience')] LibM19111001-V06-04-page16.txt: [('AMER', 'ICA')] LibM19111001-V06-04-page52.txt: [('ec', 'clesiastical')] LibM19120101-V07-01-page49.txt: [('FA', 'ro'), ('devel', 'opment')] LibM19120101-V07-01-page50.txt: [('W', 'ASHINGTON')] LibM19120401-V07-02-page48.txt: [('LIB', 'ERTY')] LibM19120701-V07-03-page13.txt: [('hol', 'iday')] LibM19120701-V07-03-page15.txt: [('St', 'ates')] LibM19120701-V07-03-page2.txt: [('Co', 'ercion')] LibM19120701-V07-03-page26.txt: [('gov', 'ernment')] LibM19120701-V07-03-page38.txt: [('AMEND', 'MENTS')] LibM19120701-V07-03-page4.txt: [('e', 'riK')] LibM19120701-V07-03-page52.txt: [('A', 'VE'), ('t', 'ok'), ('N', 'Os')] LibM19121001-V07-04-page11.txt: [('c', 'ognition')] LibM19121001-V07-04-page29.txt: [('cit', 'ations')] LibM19121001-V07-04-page44.txt: [('Hank', 'ow')] LibM19121001-V07-04-page5.txt: [('R', 'EC')] LibM19121001-V07-04-page6.txt: [('a', 'Yr'), ('a', 'dm'), ('he', 'ft'), ('I', 'lai')] LibM19121001-V07-04-page8.txt: [('prop', 'osition')] LibM19130101-V08-01-page24.txt: [('LIB', 'ERTY')] LibM19130101-V08-01-page40.txt: [('state', 'ments')] LibM19130101-V08-01-page42.txt: [('a', 'nd')] LibM19130101-V08-01-page43.txt: [('im', 'prisonment')] LibM19130101-V08-01-page49.txt: [('T', 'ennessee')] LibM19130101-V08-01-page50.txt: [('Rev', 'ised'), ('and', 'Re')] LibM19130101-V08-01-page6.txt: [('I', 'NG')] LibM19130401-V08-02-page2.txt: [('Association', 'al')] LibM19130401-V08-02-page25.txt: [('des', 'ecration')] LibM19130401-V08-02-page30.txt: [('the', 're')] LibM19130401-V08-02-page50.txt: [('and', 'Re')] LibM19130701-V08-03-page2.txt: [('e', 'ta')] LibM19130701-V08-03-page42.txt: [('GOV', 'ERNMENT')] LibM19130701-V08-03-page51.txt: [('AL', 'MA')] LibM19131001-V08-04-page12.txt: [('yo', 'ng')] LibM19131001-V08-04-page41.txt: [('establish', 'ment')] LibM19140101-V09-01-page19.txt: [('IN', 'TERIOR')] LibM19140101-V09-01-page23.txt: [('govern', 'ment')] LibM19140101-V09-01-page31.txt: [('s', 'AO')] LibM19140101-V09-01-page38.txt: [('com', 'memoration')] LibM19140101-V09-01-page53.txt: [('Ar', 'ticles')] LibM19140101-V09-01-page56.txt: [('e', 're')] LibM19140401-V09-02-page11.txt: [('i', 'ons')] LibM19140401-V09-02-page12.txt: [('combina', 'tion')] LibM19140401-V09-02-page13.txt: [('ti', 'nes')] LibM19140401-V09-02-page25.txt: [('per', 'se')] LibM19140401-V09-02-page4.txt: [('M', 'UN'), ('g', 'EE'), ('to', 'RE'), ('M', 'EH')] LibM19140401-V09-02-page49.txt: [('e', 'th')] LibM19140401-V09-02-page52.txt: [('re', 'ligious')] LibM19140701-V09-03-page17.txt: [('and', 're')] LibM19140701-V09-03-page20.txt: [('I', 'ts')] LibM19140701-V09-03-page3.txt: [('or', 'zo')] LibM19140701-V09-03-page30.txt: [('A', 'pologete')] LibM19140701-V09-03-page34.txt: [('con', 'trary'), ('ambi', 'tion')] LibM19140701-V09-03-page4.txt: [('M', 'io'), ('indi', 'tes'), ('M', 'UT')] LibM19140701-V09-03-page49.txt: [('i', 'ke')] LibM19141001-V09-04-page13.txt: [('by', 're')] LibM19141001-V09-04-page29.txt: [('can', 'es')] LibM19141001-V09-04-page3.txt: [('of', 'tenest'), ('m', 'om'), ('or', 'zo')] LibM19141001-V09-04-page38.txt: [('es', 'tablish')] LibM19141001-V09-04-page4.txt: [('M', 'Eg')] LibM19141001-V09-04-page49.txt: [('i', 'nn'), ('I', 'ntr')] LibM19141001-V09-04-page50.txt: [('L', 'os')] LibM19150101-V10-01-page15.txt: [('con', 'sistency')] LibM19150101-V10-01-page21.txt: [('per', 'se')] LibM19150101-V10-01-page22.txt: [('per', 'se')] LibM19150101-V10-01-page3.txt: [('Y', 'ou'), ('or', 'zo')] LibM19150101-V10-01-page35.txt: [('Y', 'ork'), ('CRU', 'ISER')] LibM19150101-V10-01-page4.txt: [('M', 'Eg')] LibM19150101-V10-01-page53.txt: [('t', 'iro')] LibM19150401-V10-02-page14.txt: [('pam', 'phlets')] LibM19150401-V10-02-page23.txt: [('WASH', 'INGTON')] LibM19150401-V10-02-page25.txt: [('WASH', 'INGTON')] LibM19150401-V10-02-page28.txt: [('CAP', 'TIVE')] LibM19150401-V10-02-page3.txt: [('M', 'UN'), ('or', 'zo')] LibM19150401-V10-02-page46.txt: [('CALI', 'FORNIA')] LibM19150401-V10-02-page49.txt: [('HUN', 'DRED')] LibM19150701-V10-03-page19.txt: [('C', 'opyright')] LibM19150701-V10-03-page2.txt: [('Massa', 'chusetts'), ('Col', 'lege')] LibM19150701-V10-03-page3.txt: [('M', 'Eg'), ('illus', 'trated')] LibM19150701-V10-03-page4.txt: [('I', 'Ng')] LibM19151001-V10-04-page2.txt: [('af', 'filiated'), ('Col', 'lege')] LibM19151001-V10-04-page48.txt: [('C', 'UE'), ('O', 'RE'), ('Ama', 'rillo')] LibM19151001-V10-04-page49.txt: [('m', 'Es')] LibM19151001-V10-04-page51.txt: [('E', 'Li')] LibM19160101-V11-01-page12.txt: [('who', 're')] LibM19160101-V11-01-page26.txt: [('per', 'se')] LibM19160101-V11-01-page27.txt: [('per', 'se')] LibM19160101-V11-01-page28.txt: [('R', 'ighteousness')] LibM19160101-V11-01-page44.txt: [('r', 'ef')] LibM19160101-V11-01-page6.txt: [('KEN', 'TUCKY')] LibM19160101-V11-01-page7.txt: [('Calif', 'ornia')] LibM19160101-V11-01e-page11.txt: [('per', 'se')] LibM19160401-V11-02-page22.txt: [('mat', 'ters')] LibM19160401-V11-02-page4.txt: [('C', 'hr')] LibM19160401-V11-02-page48.txt: [('Mar', 'shal')] LibM19160701-V11-03-page14.txt: [('A', 'fter')] LibM19160701-V11-03-page15.txt: [('de', 'partment')] LibM19160701-V11-03-page34.txt: [('r', 'ea')] LibM19160701-V11-03-page39.txt: [('C', "esar's")] LibM19161001-V11-04-page15.txt: [('CHAR', 'ACTERISTIC')] LibM19161001-V11-04-page2.txt: [('inter', 'ests')] LibM19170101-V12-01-page2.txt: [('Ten', 'nessee'), ('af', 'filiated')] LibM19170101-V12-01-page21.txt: [('per', 'se')] LibM19170101-V12-01-page3.txt: [('Sab', 'batarians')] LibM19170101-V12-01-page30.txt: [('pro', 'hibit')] LibM19170101-V12-01-page6.txt: [('and', 're')] LibM19170701-V12-03-page4.txt: [('I', 'RE')] LibM19170701-V12-03-page9.txt: [('f', 'undamentals')] LibM19171001-V12-04-page10.txt: [('CON', 'SUMED')] LibM19171001-V12-04-page11.txt: [('suav', 'ity')] LibM19171001-V12-04-page19.txt: [('ha', 're')] LibM19171001-V12-04-page30.txt: [('gen', 'eral'), ('S', 'HUTE')] LibM19180101-V13-01-page11.txt: [('intro', 'duced')] LibM19180101-V13-01-page12.txt: [('C', "esar's")] LibM19180101-V13-01-page17.txt: [('deter', 'Mination')] LibM19180101-V13-01-page24.txt: [('a', 'nd')] LibM19180101-V13-01-page31.txt: [('a', 're')] LibM19180401-V13-02-page20.txt: [('or', 'dained')] LibM19180401-V13-02-page24.txt: [('MASSA', 'CHUSETTS')] LibM19180701-V13-03-page16.txt: [('THE', 'TA')] LibM19180701-V13-03-page19.txt: [('to', 'ut')] LibM19181001-V13-04-page11.txt: [('per', 'se')] LibM19181001-V13-04-page20.txt: [('peril', 'ous')] LibM19190101-V15-01-page21.txt: [('per', 'se')] LibM19190401-V15-02-page22.txt: [('to', 'Ca')] LibM19190401-V15-02-page23.txt: [('le', 'Fevre')] LibM19190701-V15-03-page18.txt: [('minor', 'ity')] LibM19190701-V15-03-page2.txt: [('Of', 'fice'), ('affil', 'iated')] LibM19190701-V15-03-page31.txt: [('per', 'se')] LibM19190701-V15-03-page34.txt: [('in', 'struction')] LibM19190701-V15-03-page6.txt: [('As', 'sn')] LibM19191001-V15-04-page17.txt: [('bap', 'tism')] LibM19191001-V15-04-page21.txt: [('O', 'NE')] LibM19191001-V15-04-page24.txt: [('gov', 'ernment')] LibM19200101-V14-01-page5.txt: [('a', 'nd')] LibM19200101-V14-01-page6.txt: [('con', 'cerning')] LibM19200401-V14-02-page11.txt: [('W', 'ashington')] LibM19200401-V14-02-page22.txt: [('per', 'se')] LibM19200401-V14-02-page23.txt: [('per', 'se')] LibM19200401-V14-02-page27.txt: [('pro', 'tection')] LibM19200401-V14-02-page36.txt: [('z', 'oo')] LibM19200701-V14-03-page19.txt: [('ques', 'tions')] LibM19200701-V14-03-page25.txt: [('per', 'se')] LibM19200701-V14-03-page36.txt: [('t', 'itI')]
In [32]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/LibM/correction7 Average verified rate: 0.9815507052480597 Average of error rates: 0.033392883345561265 Total token count: 1451546
In [33]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[33]:
[("'", 1499), ('m', 1314), ('d', 1253), ('e', 997), ('w', 951), ('t', 835), ('n', 774), ('r', 677), ('f', 631), ('g', 383), ('x', 271), ('u', 206), ('k', 192), ('tv', 150), ('th', 117), ('pa', 100), ('sunday-law', 92), ('z', 82), ('ex', 75), ('io', 71), ('id', 71), ('co', 64), ('postmaster-general', 62), ('re', 59), ('mo', 58), ('ga', 58), ('post-offices', 57), ('un-american', 57), ('statute-books', 56), ('va', 56), ('sunday-closing', 54), ('church-and-state', 49), ('mm', 46), ('q', 44), ('un', 43), ('mt', 42), ('attorney-general', 41), ('tion', 40), ('sunday-rest', 39), ('wm', 38), ('pp', 38), ('charta', 37), ('ro', 35), ('li', 35), ('neander', 31), ('-', 30), ('seventhday', 30), ('mi', 28), ('ky', 28), ('religio-political', 27)]
Correction 8 -- Remove Long Tokens with Repeating "m"s¶
In [34]:
reports.long_errors(errors_summary, min_length=17)[:50]
Out[34]:
(['countermemorialists', 'immumnitommuummunitimmtwuntnimmummiona', 'antiprohibitionists', 'mmierriotitimmiiembitimiimerimiim', 'vuaziffiemunimeluitennotinutnnifin', 'nrmomoommomomrsoommommokmagmkwon', 'iiiwtierttititiiiit', 'iiimiumiummummimmominimmimmimmimmihmimmiimminimummummimmumummemimmimmimm', 'church-and-stateunion', 'iiiirreriiitlhinifid', 'simmmismwklaiigitil', 'mimmimmummiiimmimii', 'iiiiiiiiiiiliiiiiiii', 'rwiumwimmiiiiimimmumnii', 'mmmmmmmmmmmmmmmmmns', 'pilurprmarasigimmt', 'preventivejurisdiction', 'miilmilliiimilliifilmidid', 'enosnantiemotainotientetiemtio', 'mmozmrommomommonorummanoz', 'humilffilitiffinummiffiiimminlimmummiiiiiiiiimm', 'xramoxmozramommommocmommmommx', 'seventh-day-observing', 'yffinsmemmmmmmmwmswmmmmmmmnim', 'ititeiltintonecfctration', 'migininaugimmikimmu', 'latitteilommtwtfifolror', 'mvstimpsmgrecuttliv', 'iullnnunulnmmuumnuluunnunuumlt', 'mgraotrtraccommozraglgraccommicami', 'aommommemsatammogarmaxsorarmwelimmelinuilmenompommixliniewtlominermiimmurpimumnuommurm', 'iiiiiiiiiiimmiumulinuilmilne', 'better-established', 'rnomommonoszuzummummanmmollommom', 'nemmiwiiiimortrinl', 'ffiummummiummunnummumummmumumummummunamummunummuumummmunummunnummumummumnitumnims', 'counter-petitioners', 'imummintommumnimminumummmummlimmunumummummmunumumminutimmummmitimumnimmm', 'inimlfilninninilli', 'mmmmmmmmermmmmmmmmmmmmmmm', 'gawavaiaaamminonwirit', 'xxxxxxxxxxmocxxxxxxxx', 'miiiiiniiiiiiimilimiiiiiiiiiiimiliiiiiimmionimmiumingiiiiiiiiiiiimmiliiiimmomiiiiminwiliiiiiiiiiiiiiiiiiinminsummuilimiliiinimonnimmiiiiiiiiiiiiiiiiiiimiimiq', 'wimiimilliiiiiiiiiiiiimmimmithiiiiimmumminunifiniiiii', 'unimilismimitimittnismitimmimittlimummumitemitimmummmintimmimiumiumnitimllminiummuntiummilmi', 'anumilimminiumminnimminumminnummiliniummiliml', 'affindlitilffilillikvillehd', 'mmmmmmmmmuimmmmmmrimmmmmmmmmmmmmmm', 'muminatatimiumumuutumitimmittimmummminnumminumuffiummumummunnomiminummuummummimmumnini', 'rsomravramcmotrammragmonommxmommansom', 'mmmmmmffirimmmmmmmmmmmmemmmmmmm', 'antiecclesiastical', 'penmenisrisdinaorabsesiceewer', 'ractimiriiiiitiriiltililietcliteiviivittiiiitievalifittia', 'politico-religious', 'niviitiesialiffiliifiiilrimlnii', 'unemeeeeeeneeleeneeetelli', 'vaaffisl-co-pacific', 'lllllllllllllllllllllllllllllllllllllllllllllllllllllllllll', "mmmosmermsimmmemmnm'iligh", 'mmmmmnemmommmnmmmme', 'religion-and-state', 'ifaimitialiumuumnimimmtmimummuimmunimiummitinimminimmumminummumunnommumminumninummunim', 'iiiiiiiiiiiiiiiiiiiiiii', 'mgimmmmmmmmmmmmmmmmmmmmmmmmmmmm', 'msossgmaiaassmgeamakawmalnarlaa', 'lecosniiionpainoticsovicesfirde', "linunimmimrs'inumumu", 'rrrprrrrrrrritrrrf', 'lamjukgmdavagixiatm', 'toforeigncountries', 'wmiwimiiiiiiiiiiiiiiimmiumm', 'tixtreciremyemiresnirtiortiorrioritortiorrii', 'democraticrepublican', 'hihinhiniiiiiiirin', 'two-and-a-halfmile', 'commander-in-chief', 'self-determination', 'mmimumumwwwiiiiiiiiiiiiiiiiiiiiiiiiiiiiilleeleteeemememme', 'nmmmommrsonomrznemonmonomnrmotruomonom', 'emmonmenommomumommommommonotrnommirnmn', 'iiiiiiilliniemniiiiii', 'tsereanctosrothciertny', 'snlrnuurinunuununa', 'rimareinsmiummisimememesiermem', 'mcommommommuommommonomm', 'tiarezemieeleismikiimeeemiewew', 'emerhilsamalsinalso', 'pimumwmummuniumummtimmtunit', 'burckhardt-schatzmann', 'constitution-makers', 'limmiiiiimiiiiiiiiiiiiiiiiiiiiiiiiiiiiie', 'hummmtimmmummummummore', 'iiiiiiiiiiiiiiiiiii', 'feemowiwiedimeiersig', 'one-day-rest-inseven', 'maimimiummaimmismilinuminutimmuminiumilmmitimmummumwmoimminummiumnimmititilowinitimiiiti', 'moerlrlreemoinmemmommmommikumoe', 'mmmmmmmmmmmmmmmmmmmmmm', 'postmaster-general', 'establishingreligious', 'one-day-rest-in-seven', 'emelieniwionsavibannotisloneemite', 'vice-president-elect', 'faipmkrivmriiyamkrkilsriiirrrriiiirrrrisikv', 'rilifininniimummaimumeiminiiiiiiiniiiiiiiiiiiiiiiiiimmpumummhimumwommiimmiimmiiiiimmimmimmimiumnimimmim', 'e-illmllommimilimmilummumenimilmnimuningumminumiiiiimilmimmunimifinnilionontimmigimiliiimiffiliffilimiliiiiiiiiiiiiimm', 'unummonummtummunialliimumiir', 'hiiiiiiiiiiiiiiiii', 'wimmummtmmuntifiummiummiummmommumwffimmiummummummulmtmminammmmunnummmumummummumummr', 'smmusissommummusismussmimussissusissimmimmiiiiiiium', 'hiimiiiiiiiiiiiiiiiiniiumiuminimui', 'iiiimmummiatumbiiiiimbimummiiiiiiimmimm', 'xxxxxxxxxxxxxxxxxx', 'ramtersimrammemarkirracarmermartm', 'mmipoinnonfoemnnioannim', 'nomenegvoicedienast', 'inoomalloisossimis', 'nmommumammammunnumumuum', 'mozmnmwommolzemrammonommommommommn', 'iillrieeiaiiirriardi', 'agaomoorwairalioigtiargial', 'lmiiiiiiiiiiiiiiiiiii', 'ihilibillilltreterita', 'secretary-ofthe-interior', 'conscience-fettered', 'muummmunnummonmummumuummmunimmupm', 'campbell-bannerman', 'impreeloreesocoeselaal', 'ffassininsonsiwoloolgasers', 'unnnnnnunnmamnumununnmmunmniiimm', 'non-sunday-observing', 'piihnummuumbhimurunimenhomuummununimminhhohuminumuunummunnuhhhimminbui', 'mheminuffinfillffilimis', 'alliallallialliallaillassiiiiiiiiiiimiiiiiiiiiiiiim', 'mmmmeimmimmmmmmmmmmmmmmttmmmmmtim', 'inforfaisiomomincomocadoviemmigoimiwa', 'smossmunssunommummusnmussmssmissussmsmussmmssmissmossmussussummmmusstmosssmsmssmnnsmimmumsmimmwsrmossumms', 'sssssssssssssssssssss', 'mmiiiimumhimimmiiiiimm', 'statesman-preacher', 'inter-denominational', 'nosonmomorwemcwaint', 'reconstructionists', 'mmommmommommommmom', 'mnrummommommoncommommmown', 'nfiemmeemmeemmmeeeeeeeeeeeemeeem', 'ipuitnilinimilliiiinulillluunii', 'ehmmmmmmmmmmmmmmmmmmmmmmmmmmmmm', 'mmmmommmmmmmwmnirimemmmmmm', 'wamegkimnmrummmmesemvmmmrmk', 'nininimummujimininlini', 'lieutenantgovernor', 'self-glorification', 'jiuwuuwnnwumllonllllllllhihiiuiiuihul', 'ommumniummunuimiumuutimutimmulummimmiummintomunmumumummumumumnomminuninumninummumumummtuntiummirt', 'self-aggrandizement', 'mummmummimmimmimmimmiiiimmiiiimmiummimmiimimmimmimmiihimmiimmi', 'religious-sabbatic', 'iitoitllislossoliiosill', 'intheszealwarfejrrnicenathemoatiry', 'religio-constitutional', 'iiiiiiiiiiiiiiiiiiii', 'personal-heart-conversion', 'much-to-be-desired', 'netlftrrmmidhimizmmommommilvmm', 'curiosity-gratifying', 'ummuummutmummuummmiummummummummumumminummonummunummmummuummuuttimmumut', 'immuunumummummmtuummummiumunumtumffimmmutummunmuu', 'mconslfaitmeegtifo', 'monmmaimmenimmmmmmmemm', 'itmlinillitiniiimmullimitilittiminunitiffitiminimmituniumnitmitilistimmilimutiiiiiimitimitintiumnimmummitm', 'emsmwmmmwmmmnmhoneni', 'eimmiumiiiiiiimmumummiiimumillimimminimumaniumiffiffimmiummuumniimmommumiummlinmmiumullimmi', 'selfaggrandizement', 'flummimmumommifiumwmffimmumnimmimummlimmumimmmunlimmmummmumuummumummlimuummumumumung', 'faimmeigegrommegfa', 'satisfactostruction', 'miommooomoomsoicimuchmusuoihiuoimisiummicosississinasseeememeescs', 'monommomozragrammxragnm', 'pecsetemmeltigazolom', 'ssumsffismssumusummummtmussessumnsumussunstsmossmossmwsussmumunnmunsummossumsnwssumminimmsnintminimmusmussinissunues', 'trgatimedimegoovemotwo', 'iiimillilintirnimmimmiiiiiiiiiiiiiiitiiiiiiiiiiiiiffilii', 'animmiimmimmiimmiimimmiwimmumwhimommommiimmihmummimiummimmemiimmummummenumiiiimmuumminummis', "attorney-general's", 'rimmineiiiiiiiiimirre', 'sunday-enforcement', 'momeoecimmoimommomommoiximm', 'nitroenrtenaddlimeg', 'mititayerwiriiiinicrierier', 'twenty-four-hour-day', 'atssussusumoususissonclaciiimmiimmisiscommissi', 'ragmmmmmmmmmmmmmmmmmmmmimmm', 'mmmmmmmmmmmmmmmmme', 'xectimmecemommommiimommommomme', 'mmohcomemmaragraanilmmmohm', 'iiiiiiiinillitiiii', 'criiitriatoyearetriarmireirntrecltwieviretriarctieanyaremiractmiteetreowehatio', 'lllllllllllllllllllllllllll', 'state-and-religion', 'compulsory-sunday-law', 'iiiiiiiiiiiiiiiiiiiiiiiiiii', 'hmhimimmiiiiiimmihmiimumm', 'iiiiiiiiiiiiiiiiiiiii', 'mouaamaaammmaaaaaaaaamamanmmammmammimaaaaaamaaaaaammiaaaaaa', 'illlllulllllllllllllllllllllllllllll', 'iwiiiiiiiiiiiiiiiiii', 'associate-justices', 'mmumiimmumiummimmiimmiiimimmiwimmumbiumummuimmiimmwimummummirmiumie', 'consaalermtooldlny', 'counterdenunciations', 'wralrammimmrzrznomnommgmmonom', 'iiiiiiiimmiiiiiiiiiiiiimmium', 'iimmiimmummuimmimummwimummmimmimimmiummmummuminwimbhmmimmiliniffillinnuffiffill', 'obviouslyagreement', 'one-dayof-rest-in-seven', 'mezmommommonommomommommommommmom', 'rrigtreatiariiirriiriiiiriiiitrivittioriiilrrictiiilriiiitii', 'mmommommimommotimmotmm', 'history-confirming', 'semi-ecclesiastical', 'lffiffimffithimmouninoffimmommummuommunimmonwiniiiiminnumumminriumminlimminiiiiiiiiiiimmonimum', 'tomplonsesolomerol', 'mmmmmmnswmmesimmornmmmmmm', 'intelligent-looking', 'ivosengtoexirmemed', 'suspension-bridges', 'self-righteousness', 'miummuiiiiiiiiiimmummmiiiiihimmmummimmummimmimi', 'mmemmmmmmmmmmeemmmemmmmmmmmmm', 'mmmnwnsommmmmmmmrmmmolm', 'iumitimumummumunnumaintimmummumumiumummumummtunmitimuumminnimuummuummumminumismiumnimmuntimmmuthw', 'five-million-dollar', 'esnmemmmmmmrimmmmeenmmemmm', 'politico-ecclesiastical', 'counterallegations', 'meenmenmmmmmmmmmawknmmmmmeg', 'mmmmmmmmmmmmmmmmmmmx', 'alaska-yukon-pacific', 'ecemoictiemememoodemeeeme', 'commandment-keepers', 'trothofabusesandegurpatent', 'trading-with-the-enemy', 'go-to-churchor-stay-indoors', 'bureau-of-military-intelligence', 'mimmutinsimiunimminimmummusilinnimmimuminnumminnimmummilinuffisliinummimilmilimitimiumminniiniiitimitimmimmilimititinnum', 'seventeenth-century', 'iitiiiitiwiiiititivaititignifittaiiiitqawilitilitit', 'mmniummiunrimiiiiiiiilfmfiinotoiiimun', 'ussosiiimiwohmiiiiiiiismisoisisiiiiminallioisoisoisoisosiososi', 'mrnmimmmm-rimmmmmmoamrimmmmm', 'prescott-wilson-tumulty', 'nifilnimifintilllillflillifilnifilmiummiiffillfill', 'inimiiiiiminunimmilimumusinominimuninimmilmr', 'emirmeilsaarsinemiliehmee', 'rograssmargmeermirl', 'ttimilimmumulminnittinitintinninitutimmi', 'frankfort-on-the-main', 'glilihiliiiiiiraiii', 'religious-legislation', 'intfilnilhimirimihimmihimiirminlnimimiriminiimium', 'minimmiiiiiiiiiimihwohimulla', 'one-day-of-rest-in-seven', 'uvrapsimisulswipampiampv', 'lieutenant-colonel', 'iiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiii', 'yhmommomownwmmmmmmm', 'half-pintof-claret', 'mutummimmomminumummummumunimmiumummummlimummmumumunummtimummimintowitmmummrx', 'immegmmmmmmmmmmmmmmmmmr', 'mmmeammmmmmmmmmmmmmmmmmmemmmmm', 'parochial-school-system', 'ztkirmintzflrmerifranc', 'mramiluesimairrimamesiemiamemilie', 'emimmiummehmemeimmimminimmummminimmeminimmumummemminiummunieli', 'agretiitilitltitstriffigtisifitiveram', 'wommiumniffunivirlsoir', 'maher-shalal-hash-baz', 'xxxxxxxxxxxxxxxxxxxxxx', 'xxxxxxxxxxxxxxxxxxxxx', 'ossionosollsomasismisiiiiiisiiimissimisomallaallallaillaffluss', 'mommummmufflummunmuummmutommummmuummmumumummummumunummummuunmuumwo', 'governmentsupported', 'immmotzmotatmtmommzum', 'mmommomeommmmozmommotrmmgramopagr', 'heaven-enlightened', "postmaster-general's", 'mmgrommmmommgrmommmoromrmonorz', 'succeedinggenerations', 'imememememeinimeimii', 'siiiiiiilitaiiiiiiiiiaisill', 'self-contradictory', 'ostammosanosonsorr', 'rnitivittiltifirmi', 'muniummmitimlinini', 'ormucesemmommannumorammosimemaamoutammovomnumeammnommukumumonmustormmummunno', 'counterdemonstrations', 'i-ifidairicliiiriiirroi', 'thefactthattheyinvolvethevitalprinciple', 'inmpaiavimmipamipammmiximp', 'demonstrainfluence', 'nunnnunnuuuuuuuuuuunnuuuuuuuuuunnnuuunmuuuumusuuuuunuuuuuuuuuuuuuuuununnnnnunuuuuunuuuuuuuuunuwuum', 'immumimilimitmliminiiimiiiiiiiiiiiiiiiiiiiimmiiiiiiiiiiimmintmill', 'milliummiumunmionwimmimmiumr', 'weiverreitaararforreahaarivitoroyerriiivii', 'vriliriiifiertailitarectrinfeltriatiatictitlifie', 'iiimuumimiiiiiimhomidfinnlinlinnflunnhohhohimimhhommilinlinflo', 'maher-shalalhash-baz', 'jerusalem-to-jericho', 'church-and-state-union', 'mimmomiosomosoissoisioissossosivissossicsiiiss', 'hriimmiiiiimeimiiimihni'], 17)
Remove long tokens with long strings of "m", "i", "l", "x"
In [35]:
# %load shared_elements/remove-tokens-with-long-strings-of-characters.py
prev = "correction7"
cycle = "correction8"
directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
os.makedirs(directories['cycle'])
corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))
for filename in corpus:
content = utilities.readfile(directories['prev'], filename)
text = re.sub(r"[0-9,!?$:;&]", " ", content)
tokens = utilities.tokenize_text(text)
replacements = []
replacements.append(clean.check_for_repeating_characters(tokens, "m"))
replacements.append(clean.check_for_repeating_characters(tokens, "M"))
replacements.append(clean.check_for_repeating_characters(tokens, "i"))
replacements.append(clean.check_for_repeating_characters(tokens, "I"))
replacements.append(clean.check_for_repeating_characters(tokens, "l"))
replacements.append(clean.check_for_repeating_characters(tokens, "x"))
replacements.append(clean.check_for_repeating_characters(tokens, "X"))
replacements.append(clean.check_for_repeating_characters(tokens, "u"))
replacements.append(clean.check_for_repeating_characters(tokens, "n"))
replacements = [item for sublist in replacements for item in sublist]
if len(replacements) > 0:
print('{}: {}'.format(filename, replacements))
for replacement in replacements:
content = clean.replace_pair(replacement, content)
else:
pass
with open(join(directories['cycle'], filename), mode="w") as o:
o.write(content)
o.close()
LibM19080101-V03-01-page1.txt: [('wimmummtmmuntifiummiummiummmommumwffimmiummummummulmtmminammmmunnummmumummummumummr', ' '), ('JiuWuuWnnWumllonllllllllhIHIIUIIUIHUL', ' ')] LibM19080401-V03-02-page1.txt: [('ifaimitialiumuumnimimmtmimummuimmunimiummitinimminimmumminummumunnommumminumninummunim', ' '), ('mutummimmomminumummummumunimmiumummummlimummmumumunummtimummimintowitmmummrx', ' '), ('hummmtimmmummummummore', ' '), ('lllllllllllllllllllllllllll', ' '), ('lllllllllllllllll', ' ')] LibM19080701-V03-03-page1.txt: [('muminatatimiumumuutumitimmittimmummminnumminumuffiummumummunnomiminummuummummimmumnini', ' '), ('Eimmiumiiiiiiimmumummiiimumillimimminimumaniumiffiffimmiummuumniimmommumiummlinmmiumullimmi', ' '), ('MMIMMIMMIMUMMIP', ' '), ('Eimmiumiiiiiiimmumummiiimumillimimminimumaniumiffiffimmiummuumniimmommumiummlinmmiumullimmi', ' ')] LibM19081001-V03-04-page1.txt: [('flummimmumommifiumwmffimmumnimmimummlimmumimmmunlimmmummmumuummumummlimuummumumumung', ' '), ('MMUMIIMMUMIUMMIMMIIMMIIIMIMMIWIMMUMBIUMUMMUIMMIIMMWIMUMMUMMIRMIUMIE', ' '), ('HumilffilitiffinumMIffiiimminlIMMUMMIIIIIIIIIMM', ' '), ('MMUMIIMMUMIUMMIMMIIMMIIIMIMMIWIMMUMBIUMUMMUIMMIIMMWIMUMMUMMIRMIUMIE', ' '), ('UMMIIIIIIMIIIIIM', ' ')] LibM19090101-V04-01-page1.txt: [('iumitimumummumunnumaintimmummumumiumummumummtunmitimuumminnimuummuummumminumismiumnimmuntimmmuthw.', ' '), ('iumitimumummumunnumaintimmummumumiumummumummtunmitimuumminnimuummuummumminumismiumnimmuntimmmuthw.', ' ')] LibM19090401-V04-02-page1.txt: [('HMHIMIMMIIIIIIMMIHMIIMUMM', ' '), ('PIIHNUMMUUMBHIMURUNIMENHOMUUMMUNUNIMMINHHOHUMINUMUUNUMMUNNUHHHIMMINBUI', ' '), ('HMHIMIMMIIIIIIMMIHMIIMUMM', ' '), ('unnnnnnunnmamnumununnmMunmNIIIMM', ' ')] LibM19090701-V04-03-page1.txt: [('ummuummutmummuummmiummummummummumumminummonummunummmummuummuuttimmumut', ' '), ('MIUMMUIIIIIIIIIIMMUMMMIIIIIHIMMMUMMIMMUMMIMMIMI', ' '), ('MIUMMUIIIIIIIIIIMMUMMMIIIIIHIMMMUMMIMMUMMIMMIMI', ' '), ('lllllllllllllllllllllllllllllllllllllllllllllllllllllllllll', ' '), ('ummuummutmummuummmiummummummummumumminummonummunummmummuummuuttimmumut', ' ')] LibM19100101-V05-01-page1.txt: [('nmommumammammunnumumuuM', ' '), ('MUMMMUMMIMMIMMIMMIMMIIIIMMIIIIMMIUMMIMMIIMIMMIMMIMMIIHIMMIIMMI', ' '), ('rilifininniiMUMMAIMUMEIMiniiiiiiiniiiiiiiiiiiiiiiiiiMMPUMUMMHIMUMWOMMIIMMIIMMIIIIIMMIMMIMMIMIUMNIMIMMIM', ' '), ('rilifininniiMUMMAIMUMEIMiniiiiiiiniiiiiiiiiiiiiiiiiiMMPUMUMMHIMUMWOMMIIMMIIMMIIIIIMMIMMIMMIMIUMNIMIMMIM', ' '), ('MUMMMUMMIMMIMMIMMIMMIIIIMMIIIIMMIUMMIMMIIMIMMIMMIMMIIHIMMIIMMI', ' '), ('rilifininniiMUMMAIMUMEIMiniiiiiiiniiiiiiiiiiiiiiiiiiMMPUMUMMHIMUMWOMMIIMMIIMMIIIIIMMIMMIMMIMIUMNIMIMMIM', ' '), ('lliilligijnirMli"Illj', ' ')] LibM19100401-V05-02-page1.txt: [('unummonummtummunialliimumiir', ' '), ('ommumniummunuimiumuutimutimmulummimmiummintomunmumumummumumumnomminuninumninummumumummtuntiummirt', ' '), ('WIMIIMilliiiiiiiiiiiiimmimmithiiiiimmumminunifiniiiii', ' '), ('mommummmufflummunmuummmutommummmuummmumumummummumunummummuunmuumwo', ' '), ('MMIIIIMUMHIMIMMIIIIIMM', ' '), ('iiiimmummiatUMBIIIIIMBIMUMMIIIIIIIMMIMM', ' '), ('WIMIIMilliiiiiiiiiiiiimmimmithiiiiimmumminunifiniiiii', ' '), ('iiiimmummiatUMBIIIIIMBIMUMMIIIIIIIMMIMM', ' '), ('mommummmufflummunmuummmutommummmuummmumumummummumunummummuunmuumwo', ' ')] LibM19100701-V05-03-page1.txt: [('imummintommumnimminumummmummlimmunumummummmunumumminutimmummmitimumnimmm', ' '), ('immuunumummummmtuummummiumunumtumffimmmutummunmuu', ' '), ('immuunumummummmtuummummiumunumtumffimmmutummunmuu', ' ')] LibM19101001-V05-04-page1.txt: [('MIMMIMMUMMIIIMMIMII', ' '), ('ANIMMIIMMIMMIIMMIIMIMMIWIMMUMWHIMOMMOMMIIMMIHMUMMIMIUMMIMMEMIIMMUMMUMMENUMIIIIMMUUMMINUMMIS', ' '), ('EMIMMIUMMEHMEMEIMMIMMINIMMUMMMINIMMEMINIMMUMUMMEMMINIUMMUNIEli', ' '), ('IIIMIUMIUMMUMMIMMOMINIMMIMMIMMIMMIHMIMMIIMMINIMUMMUMMIMMUMUMMEMIMMIMMIMM', ' '), ('ANIMMIIMMIMMIIMMIIMIMMIWIMMUMWHIMOMMOMMIIMMIHMUMMIMIUMMIMMEMIIMMUMMUMMENUMIIIIMMUUMMINUMMIS', ' '), ('MINIMMIIIIIIIIIIMIHWOHIMUlla', ' '), ('liMMIIIIIMIIIIIIIIIIIIIIIIIIIIIIIIIIIIIE', ' ')] LibM19110101-V06-01-page1.txt: [('...ffiummummiummunnummumummmumumummummunamummunummuumummmunummunnummumummumnitumnims', ' '), ('muummmunnummonmummumuummmunimmuPm.n', ' '), ('IIMMIIMMUMMUIMMIMUMMWIMUMMMIMMIMIMMIUMMMUMMUMINWIMBHMMIMMIliniffillinnUffiffill', ' '), ('mmniummiunrimiiiiiiiilfmfiinotoiiimun', ' '), ('WMIWIMIIIIIIIIIIIIIIIMMIUMM', ' '), ('iullnnunulnmmuumnuluunnunuumlt', ' ')] LibM19110401-V06-02-page48.txt: [('MMMMMIMIMMMMMMMMM', ' '), ('MMMMMMMMMMMMMMMMME', ' '), ('IMMEgMMMMMMMMMMMMMMMMMR', ' '), ('MMMMMMMMMIMMM', ' ')] LibM19110401-V06-02-page49.txt: [('xxxxxxxxxxxxxxxxxxxxxx', ' '), ('XXXXXXXXXXXXXXXXXXXXX', ' ')] LibM19110701-V06-03-page1.txt: [('pimumwmummuniumummtimmtunit', ' '), ('Milliummiumunmionwimmimmiumr', ' '), ('iiimillilintirniMMIMMIIIIIiiiiiiiiiitiiiiiiiiiiiiiffilii', ' '), ('IIIIIIIIMMIIIIIIIIIIIIIMMIUM', ' ')] LibM19110701-V06-03-page48.txt: [('FAIPMKRIVMRIIYAMKRKILSRIIIRRRRIIIIRRRRISIKV', ' ')] LibM19110701-V06-03-page49.txt: [('xxxxxxxxxxmocxxxxxxxx', ' '), ('XXXXXXXXXXXXXXXXX', ' '), ('XXXXXXXXXXXXXX', ' ')] LibM19111001-V06-04-page49.txt: [('XXXXXXXXXXXXXXXX', ' '), ('XXXXXXXXXXXXXXXXXXXXXX', ' '), ('XXXXXXXXXXXXXXXXXX', ' ')] LibM19120401-V07-02-page49.txt: [('mezmommommonommomommommommommmom', ' ')] LibM19120401-V07-02-page50.txt: [('mnrummommommoncommommmown.mo', ' '), ('mmgrommmmommgrmommmoromrmonorz.', ' ')] LibM19120401-V07-02-page51.txt: [('nrmomoommomomrsoommommokmagmkwon.', ' ')] LibM19120701-V07-03-page49.txt: [('Emmonmenommomumommommommonotrnommirnmn', ' ')] LibM19120701-V07-03-page50.txt: [('XraMOXMOZraMOMMOMMOCMOMMMOMMX', ' '), ('MMOMMMOMMOMMOMMMOM', ' ')] LibM19120701-V07-03-page51.txt: [('rsomravramcmotrammragmonommxmommansom', ' '), ('mozmnmwommolzemrammonommommommommn', ' '), ('MOMa.netlftrrMMIDHIMIZMMOMMOMMILVMM', ' ')] LibM19121001-V07-04-page3.txt: [('MMOMMOMMIMOMMOTIMMOTMM', ' ')] LibM19130101-V08-01-page3.txt: [('MMOMMOMEOMMMMOZMOMMOtrMMgraMOPagr', ' ')] LibM19130101-V08-01-page4.txt: [('ragMMMMMMMMMMMMMMMMMMMMIMMM', ' ')] LibM19130401-V08-02-page3.txt: [('MOMEOECIMMOIMOMMOMOMMOIXIMM', ' ')] LibM19130401-V08-02-page4.txt: [('wralrammimmrzrznomnommgmmonom', ' ')] LibM19130401-V08-02-page52.txt: [('mmozmrommomommonorummanoz', ' ')] LibM19130701-V08-03-page3.txt: [('MMOHCOMEMMAragraanilMMMOHM', ' ')] LibM19140101-V09-01-page3.txt: [('XECTIMMECEMOMMOMMIIMOMMOMMOMME', ' ')] LibM19140101-V09-01-page4.txt: [('IMMEMMOMMOMMEMN', ' ')] LibM19140401-V09-02-page3.txt: [('MMMEAMMMMMMMMMMMMMMMMMMMEMMMMM', ' '), ('MMMMMMMMMMMMMMMMMnS', ' ')] LibM19140401-V09-02-page4.txt: [('MEENMENMMMMMMMMMaWKNMMMMMEg', ' '), ('MRNMIMMMM-riMMMMMMOAMRIMMMMM', ' ')] LibM19140701-V09-03-page4.txt: [('mmmmmmnswmmEsImmornmmmmmm', ' '), ('MMMMMMMMMMMMMMMMMMMMMM', ' '), ('EHMMMMMMMMMMMMMMMMMMMMMMMMMMMMM', ' ')] LibM19140701-V09-03-page52.txt: [('.ormucesemmommannumorammosimemaamoutammovomnumEammnommuKumumonmustormmummunno', ' '), ('aommommemsatammogarmaxsorarmwelimMelinuilmenomPommixliniewtlominermiimmurpimumnuommurM', ' ')] LibM19141001-V09-04-page4.txt: [('MMEMMMMMMMMMMEEMMMEMMMMMMMMMM', ' ')] LibM19141001-V09-04-page49.txt: [('LMIIIIIIIIIIIIIIIIIII', ' ')] LibM19141001-V09-04-page50.txt: [('MMMMMMMMMMMMM', ' ')] LibM19150101-V10-01-page3.txt: [('mmmmommmmmmmwmnirimEmmmmmm', ' '), ('EMMMMMMMNiMMMMMM', ' ')] LibM19150101-V10-01-page4.txt: [('Mmmnwnsommmmmmmmrmmmolm', ' '), ('MMMMMMMMMMMMMMMMMMMX', ' ')] LibM19150101-V10-01-page52.txt: [('MMMMMMMMMUIMMMMMMRIMMMMMMMMMMMMMMM', ' ')] LibM19150401-V10-02-page3.txt: [('MMMMMMMMERMMMMMMMMMMMMMMM', ' ')] LibM19150401-V10-02-page4.txt: [('MMMMMNEMMOMMMNMMMME', ' '), ('ESNMEMMMMMMRiMMMMEENMMEMMM', ' ')] LibM19150401-V10-02-page49.txt: [('Illlllulllllllllllllllllllllllllllll', ' ')] LibM19150701-V10-03-page3.txt: [('MMMMMMffiRiMMMMMMMMMMMMEMMMMMMM', ' ')] LibM19150701-V10-03-page4.txt: [('moNmmaimmEnimmmmmmmEmm', ' '), ('NFIEMMEEMMEEMMMEEEEEEEEEEEEMEEEM', ' ')] LibM19151001-V10-04-page48.txt: [('MgiMMMMMMMMMMMMMMMMMMMMMMMMMMMM', ' '), ("MMMOSMERMSIMMMEMMNM'iligH", ' ')] LibM19151001-V10-04-page49.txt: [('yffinsmEmmmmmmmwmswmmmmmmmnim', ' '), ('MMMMEiMMIMMMMMMMMMMMMMMTTMMMMMTIM', ' ')] LibM19151001-V10-04-page50.txt: [('Yhmommomownwmmmmmmm', ' ')] LibM19160101-V11-01-page4.txt: [('MMI.I.I.IIWIFINMOMM...MM.M', ' ')] LibM19160101-V11-01-page51.txt: [('rnomommonoszuzummummanmmollommom', ' ')] LibM19160101-V11-01e-page1.txt: [('immumnitommuummunitimmtwuntnimmummiona', ' ')] LibM19160401-V11-02-page26.txt: [('t..glilihiliiiiiiraiii', ' ')] LibM19160401-V11-02e-page1.txt: [('maimimiummaimmismilinuminutimmuminiumilmmitimmummumwmoimminummiumnimmititilowinitimiiiti', ' '), ('lllllllllllllll', ' ')] LibM19170701-V12-03-page3.txt: [('IIIIIIIIIIIII', ' ')] LibM19171001-V12-04-page19.txt: [('YIIIIIIIIIIIIIII', ' ')] LibM19171001-V12-04-page3.txt: [('iiiiiiiiiii.c.ii', ' ')] LibM19180101-V13-01-page18.txt: [('Lffiffimffithimmouninoffimmommummuommunimmonwiniiiiminnumumminriumminlimminiiiiiiiiiiimmonimum', ' '), ('Lffiffimffithimmouninoffimmommummuommunimmonwiniiiiminnumumminriumminlimminiiiiiiiiiiimmonimum', ' ')] LibM19180101-V13-01-page19.txt: [('inimiiiiiminunimmilimumusinominimuninimmilmr.anumilimminiumminnimminumminnummiliniummiliml', ' ')] LibM19180401-V13-02-page3.txt: [('ossionosollSoMaSISMISIIIIIISIIIMISSIMISOMallaallallaillafflUSS', ' '), ('USSOSIIIMIWOHMIIIIIIIISMISOISISIIIIMinallIOISOISOISoisosiososi.', ' '), ('ossionosollSoMaSISMISIIIIIISIIIMISSIMISOMallaallallaillafflUSS', ' ')] LibM19180701-V13-03-page3.txt: [('smossmunssunommummusnmussmssmissussmsmussmmssmissmossmussussummmmusstmosssmsmssmnnsmimmumsmimmwsrmossumms', ' '), ('ssumsffismssumusummummtmussessumnsumussunstsmossmossmwsussmumunnmunsummossumsnwssumminimmsnintminimmusmussinissunues', ' ')] LibM19181001-V13-04-page16.txt: [('iiiiiiiiiiiiiiiiiiiiiii', ' ')] LibM19181001-V13-04-page3.txt: [('nifilnIMIfintilllillflillifilnifilmiummiiffillfill', ' ')] LibM19190101-V15-01-page3.txt: [('siiiiiiilitaiiiiiiiiiaisill', ' '), ('alliallallialliallaillaSSIIIIIIIIIIIMIIIIIIIIIIiiim', ' '), ('alliallallialliallaillaSSIIIIIIIIIIIMIIIIIIIIIIiiim', ' ')] LibM19190401-V15-02-page14.txt: [('IIIIIIIIIIIIIIII', ' '), ('HIIMIIIIIIIIIIIIIIIINIIUMIUMINIMUI', ' ')] LibM19190401-V15-02-page15.txt: [('IWIIIIIIIIIIIIIIIIII', ' ')] LibM19190401-V15-02-page18.txt: [('iiiiiiiinillitiiii', ' ')] LibM19190401-V15-02-page3.txt: [('immumimilimitmliminiiimiiiiiiiiiiiiiiiiiiiimmiiiiiiiiiiimmintmill', ' '), ('atssussusumoususissonclaCIIIMMIIMMISISCOMMISSI', ' '), ('immumimilimitmliminiiimiiiiiiiiiiiiiiiiiiiimmiiiiiiiiiiimmintmill', ' '), ('.MMIMUMUMWWWIIIIIIIIIIIIIIIIIIIIIIIIIIIIIlleeleteeemememme', ' ')] LibM19190701-V15-03-page3.txt: [('IIIIIIIIIIIIIIIIIIII', ' '), ('IIIIIIIIIIIIIIIIIIIII', ' ')] LibM19190701-V15-03-page4.txt: [('IIIIIIIIIIIIIII', ' ')] LibM19191001-V15-04-page14.txt: [('miiiiiniiiiiiimilimiiiiiiiiiiimiliiiiiimmionimmiumingiiiiiiiiiiiimmiliiiimmomiiiiminwiliiiiiiiiiiiiiiiiiinminsummuilimiliiinimonnimmiiiiiiiiiiiiiiiiiiimiimiq', ' '), ('miiiiiniiiiiiimilimiiiiiiiiiiimiliiiiiimmionimmiumingiiiiiiiiiiiimmiliiiimmomiiiiminwiliiiiiiiiiiiiiiiiiinminsummuilimiliiinimonnimmiiiiiiiiiiiiiiiiiiimiimiq', ' ')] LibM19191001-V15-04-page17.txt: [('mimmutinsimiunimminimmummusilinnimmimuminnumminnimmummilinuffisliinummimilmilimitimiumminniiniiitimitimmimmilimititinnum', ' '), ('.itmlinillitiniiimmullimitilittiminunitiffitiminimmituniumnitmitilistimmilimutiiiiiimitimitintiumnimmummitm', ' '), ('unimilismimitimittnismitimmimittlimummumitemitimmummmintimmimiumiumnitimllminiummuntiummilmi', ' '), ('mimmutinsimiunimminimmummusilinnimmimuminnumminnimmummilinuffisliinummimilmilimitimiumminniiniiitimitimmimmilimititinnum', ' '), ('.itmlinillitiniiimmullimitilittiminunitiffitiminimmituniumnitmitilistimmilimutiiiiiimitimitintiumnimmummitm', ' '), ('mimmutinsimiunimminimmummusilinnimmimuminnumminnimmummilinuffisliinummimilmilimitimiumminniiniiitimitimmimmilimititinnum', ' ')] LibM19191001-V15-04-page27.txt: [('IIIMUUMIMIIIIIIMHOMIDfinnlinlinnflUnNHOHHOHIMIMHHOMMIlinlinflO', ' ')] LibM19191001-V15-04-page28.txt: [('HIIIIIIIIIIIIIIIII', ' '), ('IIIIIIIIIIIlIIIIIIII', ' '), ('IIIIIIIIIIIIIIIIIII', ' '), ('IIIIIIIIIIIIIIIIIIIII', ' ')] LibM19200401-V14-02-page16.txt: [('MMIERRIOTITIMMIIEMBITIMIIMERIMIIM', ' '), ('HRIIMMIIIIIMEIMIIIMIHNI', ' ')] LibM19200401-V14-02-page28.txt: [('IIIIIIIIIIIIIIIIIIIIIIIIIII', ' ')] LibM19200401-V14-02-page3.txt: [('iiimmiiiiii..', ' ')] LibM19200401-V14-02-page31.txt: [('.e-illmllommimilimmilummumenimilmnimuningumminumiiiiimilmimmunimifinnilionontimmigimiliiimiffiliffilimiliiiiiiiiiiiiimm', ' '), ('.e-illmllommimilimmilummumenimilmnimuningumminumiiiiimilmimmunimifinnilionontimmigimiliiimiffiliffilimiliiiiiiiiiiiiimm', ' ')] LibM19200401-V14-02-page32.txt: [('smmusissommummusismussmimussissusissimmimmiiiiiiium', ' ')] LibM19200401-V14-02-page36.txt: [('mcommommommuommommonomm', ' '), ('nunnnunnuuuuuuuuuuunnuuuuuuuuuunnnuuunmuuuumusuuuuunuuuuuuuuuuuuuuuununnnnnunuuuuunuuuuuuuuunuwuum', ' '), ('nunnnunnuuuuuuuuuuunnuuuuuuuuuunnnuuunmuuuumusuuuuunuuuuuuuuuuuuuuuununnnnnunuuuuunuuuuuuuuunuwuum', ' ')] LibM19200401-V14-02-page5.txt: [('ENEIIIIIIIESEIIIE', ' ')] LibM19200701-V14-03-page12.txt: [('rrigtreatiariiirriiriiiiriiiitriVittiOriiilrricTiiilriiiitii', ' '), ('NiViitiesiAliffiliifiiilrimlnii', ' ')] LibM19200701-V14-03-page15.txt: [('racTIMIriiiiitiriiltililietcliteiViiVittiiiitiEVAlifittiA', ' ')] LibM19200701-V14-03-page8.txt: [('iitiiiitiWiiiititiVaititigniFittaiiiitQaWilitilitit', ' ')] LibM19201001-V14-04-page3.txt: [('iiiiiiilliniemniiiiii', ' ')] LibM19201001-V14-04-page35.txt: [('mouaamaaammmaaaaaaaaamamanmmammmammimaaaaaamaaaaaammIaaaaaa', ' '), ('MOErlrlrEEMOINMEMMOMMMOMMIKUMOE', ' '), ('IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII', ' ')]
In [36]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/LibM/correction8 Average verified rate: 0.981664985503412 Average of error rates: 0.03307226705796038 Total token count: 1451376
In [37]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[37]:
[("'", 1499), ('m', 1313), ('d', 1253), ('e', 997), ('w', 951), ('t', 834), ('n', 773), ('r', 677), ('f', 631), ('g', 383), ('x', 271), ('u', 206), ('k', 192), ('tv', 150), ('th', 117), ('pa', 100), ('sunday-law', 92), ('z', 82), ('ex', 75), ('io', 71), ('id', 71), ('co', 64), ('postmaster-general', 62), ('re', 59), ('ga', 58), ('post-offices', 57), ('mo', 57), ('un-american', 57), ('statute-books', 56), ('va', 56), ('sunday-closing', 54), ('church-and-state', 49), ('mm', 45), ('q', 44), ('un', 43), ('mt', 42), ('attorney-general', 41), ('tion', 40), ('sunday-rest', 39), ('wm', 38), ('pp', 38), ('charta', 37), ('ro', 35), ('li', 35), ('neander', 31), ('-', 30), ('seventhday', 30), ('mi', 28), ('ky', 28), ('religio-political', 27)]
In [38]:
reports.long_errors(errors_summary, min_length=15)
Out[38]:
(['countermemorialists', 'theconstitutionof', 'well-intentioned', 'antiprohibitionists', 'immmotzmotatmtmommzum', 'sundayobservance', 'disease-resisting', 'less-enlightened', 'vuaziffiemunimeluitennotinutnnifin', 'cavendish-benand', 'comizairadtgicao', 'church-and-stateunion', 'constitutionalty', 'california-nevada', 'winnington-ingram', 'iiiirreriiitlhinifid', 'boarding-schools', 'simmmismwklaiigitil', 're-establishment', 'publishing-houses', 'rwiumwimmiiiiimimmumnii', 'pilurprmarasigimmt', 'preventivejurisdiction', "religio'political", 'miilmilliiimilliifilmidid', 'enosnantiemotainotientetiemtio', 'one-day-in-seven', 'elanornelkiiisre', 'relies-political', 'narrow-mindedness', 'harmless-looking', 'seventh-day-observing', 'jeradycerelsolid', 'ititeiltintonecfctration', 'non-commissioned', 'migininaugimmikimmu', 'latitteilommtwtfifolror', 'mvstimpsmgrecuttliv', 'mgraotrtraccommozraglgraccommicami', 'ex-vice-president', 'prcestantissinium', 'iiiiiiiiiiimmiumulinuilmilne', 'above-referred-to', 'fourteen-per-cent', 'better-established', 'nemmiwiiiimortrinl', 'counter-petitioners', 'inimlfilninninilli', 'non-intoxicating', 'gawavaiaaamminonwirit', 'blood-guiltiness', 'mememeeememememe', 'whowrotethefamous', 'penmenisrisdinaorabsesiceewer', 'affindlitilffilillikvillehd', 'life-disagreeable', 'twice-interrupted', 'antiecclesiastical', 'tinitoriinlintol', 'politico-religious', 'much-appealed-to', 'religiopolitical', 'glillilisibffille', 'unemeeeeeeneeleeneeetelli', 'omortioionososom', 'vaaffisl-co-pacific', 'statesprinciples', 'eimf-immmmnmnmlne', 'religion-and-state', 'rwraeadttchehman', 'long-established', 'msossgmaiaassmgeamakawmalnarlaa', 'lecosniiionpainoticsovicesfirde', "linunimmimrs'inumumu", 'litico-religious', 'rrrprrrrrrrritrrrf', 'lamjukgmdavagixiatm', 'toforeigncountries', 'tixtreciremyemiresnirtiortiorrioritortiorrii', 'democraticrepublican', 'medico-actuarial', 'controversialist', 'hihinhiniiiiiiirin', 'wind-instruments', 'twenty-four-hour', 'two-and-a-halfmile', 'exemption-clause', 'estateifpuprenle', 'commander-in-chief', 'self-determination', 'nmmmommrsonomrznemonmonomnrmotruomonom', 'feeble-mindedness', 'tsereanctosrothciertny', 'church-membership', 'snlrnuurinunuununa', 'rimareinsmiummisimememesiermem', 'tiarezemieeleismikiimeeemiewew', 'emerhilsamalsinalso', 'non-interference', 'burckhardt-schatzmann', 'constitution-makers', "the'constitution", 'feemowiwiedimeiersig', 'one-day-rest-inseven', 'postmaster-general', 'establishingreligious', 'one-day-rest-in-seven', 'shriveled-souled', 'emelieniwionsavibannotisloneemite', 'vice-president-elect', 'self-renunciation', 'self-disciplined', 'innocent-looking', 'fourteen-year-old', 'state-established', 'civilinstitution', 'quasi-conquerors', 'inter-brotherhood', 'anti-evangelical', 'ramtersimrammemarkirracarmermartm', 'mmipoinnonfoemnnioannim', 'nomenegvoicedienast', 'actof-parliament', 'inoomalloisossimis', 'religion-andstate', 'sixteenth-century', 'erimmuralcotemurc', 'uncommercialized', 'iillrieeiaiiirriardi', 'agaomoorwairalioigtiargial', 'ihilibillilltreterita', 'publishing-house', 'vagtookagtookaog', 'selfpreservation', 'secretary-ofthe-interior', 'conscience-fettered', 'maramommraosommu', 'campbell-bannerman', 'character-making', 'religio-political', 'impreeloreesocoeselaal', 'dyed-in-the-wool', 'attorney-general', 'cannikin-clinking', 'ffassininsonsiwoloolgasers', 'lieutenantgovernor', 'jtuemmmmmwinimnir', 'non-sunday-observing', 'mheminuffinfillffilimis', 'sabbath-breaking', 'inforfaisiomomincomocadoviemmigoimiwa', 'sssssssssssssssssssss', 'statesman-preacher', 'prcestantissimum', 'tully-wainwright', 'inter-denominational', 'assumedjimperial', 'statute-preserved', 'nosonmomorwemcwaint', 'reconstructionists', "representatives'", 'iiiwtierttititiiiit', 'no-religious-test', 'ipuitnilinimilliiiinulillluunii', 'friemoossmormior', 'wamegkimnmrummmmesemvmmmrmk', 'miraglia-gullotti', 'nininimummujimininlini', 'self-glorification', 'heaven-appointed', 'sunday-amusement', 'self-aggrandizement', 'avinavvswoirliag', 'word-controversy', 'religious-sabbatic', 'iitoitllislossoliiosill', 'money-worshiping', 'intheszealwarfejrrnicenathemoatiry', 'palace-befitting', 'religio-constitutional', 'personal-heart-conversion', 'american-mexican', 'state-controlled', 'personal-liberty', 'much-to-be-desired', 'curiosity-gratifying', 'jskadmemmomendim', 'mconslfaitmeegtifo', 'relpresentatives', 'non-sectarianism', 'emsmwmmmwmmmnmhoneni', 'governor-general', 'selfaggrandizement', 'religionaboveall', 'faimmeigegrommegfa', 'satisfactostruction', 'sunday-amendment', 'miommooomoomsoicimuchmusuoihiuoimisiummicosississinasseeememeescs', 'monommomozragrammxragnm', 'statuteintrenched', 'teiiiiriafinemie', 'pecsetemmeltigazolom', 'trgatimedimegoovemotwo', 'scandalousassault', "attorney-general's", 'rimmineiiiiiiiiimirre', 'ex-congregational', 'sunday-enforcement', 'beverage-factory', 'selfgratification', 'nitroenrtenaddlimeg', 'mititayerwiriiiinicrierier', 'sundaymailreports', 'liberty-imparting', 'twenty-four-hour-day', 'liberty-bestowing', 'special-delivery', 'misunderstanaing', 'mimmuiummommosowl', 'criiitriatoyearetriarmireirntrecltwieviretriarctieanyaremiractmiteetreowehatio', 'state-and-religion', 'compulsory-sunday-law', 'unconstitunation', 'gishrimmmmomnmon', 'associate-justices', 'secretary-of-war', 'consaalermtooldlny', 'unanswerableness', 'infludemonstrated', 'self-destructive', 'counterdenunciations', 'church-dominated', 'religio-politico', 'thanksgiving-day', 'countermemorials', 'windsor-on-hudson', 'obviouslyagreement', 'busideteriorating', 'one-dayof-rest-in-seven', 'state-intrenched', 'attendstipulating', 'entlimimmimiemil', 'warm-heartedness', 'illrilohlietflir', 'history-confirming', 'semi-ecclesiastical', 'secular-rest-day', 'tomplonsesolomerol', 'intelligent-looking', 'ivosengtoexirmemed', 'suspension-bridges', 'self-righteousness', 'near-prohibition', 'day-rest-in-seven', 'statute-enforced', 'weiverreitaararforreahaarivitoroyerriiivii', 'five-million-dollar', 'twentieth-century', 'politico-ecclesiastical', 'counterallegations', 'alaska-yukon-pacific', 'ecemoictiemememoodemeeeme', 'sabbath-breakers', 'commandment-keepers', 'trothofabusesandegurpatent', 'trading-with-the-enemy', 'go-to-churchor-stay-indoors', 'bureau-of-military-intelligence', 'seventeenth-century', 'self-destruction', 'kiderlen-waechter', 'betterthan-thous', 'antitrinitarians', 'act-of-parliament', 'self-preservation', 'prescott-wilson-tumulty', 'post-reformation', 'brigadier-general', 'government-makers', 'emirmeilsaarsinemiliehmee', 'rograssmargmeermirl', 'ttimilimmumulminnittinitintinninitutimmi', 'frankfort-on-the-main', 'consumption-cure', 'disestablishtvent', 'church-collective', 'religious-legislation', 'intfilnilhimirimihimmihimiirminlnimimiriminiimium', 'one-day-of-rest-in-seven', 'fillikifineffilia', 'parochial-school', 'over-encouraging', 'uvrapsimisulswipampiampv', 'eadergettlevaaled', 'lieutenant-colonel', 'all-comprehensive', 'double-mindedness', 'man-administered', 'counter-movement', 'half-pintof-claret', 'counter-argument', 'world-conscience', 'parochial-school-system', 'ztkirmintzflrmerifranc', 'mramiluesimairrimamesiemiamemilie', 'agretiitilitltitstriffigtisifitiveram', 'microbedestroying', 'wommiumniffunivirlsoir', 'maher-shalal-hash-baz', 'governmentsupported', 'misrepreapproved', 'heaven-enlightened', 'five-hundred-word', 'incomparabiabove', "postmaster-general's", 'succeedinggenerations', 'imememememeinimeimii', 'sunday-observance', 'restaurant-keeper', 'generous-hearted', 'self-contradictory', 'seven-daysa-week', 'ostammosanosonsorr', 'rnitivittiltifirmi', 'muniummmitimlinini', "will-o'-the-wisp", 'counterdemonstrations', 'i-ifidairicliiiriiirroi', 'church-and-state', 'thirty-five-gram', 'thefactthattheyinvolvethevitalprinciple', 'imeemiumeemeemene', 'inmpaiavimmipamipammmiximp', 'demonstrainfluence', 'all-day-everyday', 'self-constituted', 'notpersonalities', 'antichristianism', 'self-condemnatory', 'trust-and-combine', 'separationwhichis', 'director-general', 'vriliriiifiertailitarectrinfeltriatiatictitlifie', 'super-government', 'maher-shalalhash-baz', 'jerusalem-to-jericho', 'nolimmowiftwommr', 'copy-thirty-five', "school-teachers'", 'church-and-state-union', 'mimmomiosomosoissoisioissossosivissossicsiiiss'], 15)
Correction 9 -- Split Squashed Words¶
In [39]:
# %load shared_elements/separate_squashed_words.py
import pandas as pd
from math import log
prev = cycle
cycle = "correction9"
directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
os.makedirs(directories['cycle'])
corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))
verified_tokens = []
for filename in corpus:
content = utilities.readfile(directories['prev'], filename)
clean.get_approved_tokens(content, spelling_dictionary, verified_tokens)
tokens_with_freq = dict(collections.Counter(verified_tokens))
words = pd.DataFrame(list(tokens_with_freq.items()), columns=['token','freq'])
words_sorted = words.sort_values('freq', ascending=False)
words_sorted_short = words_sorted[words_sorted.freq > 2]
sorted_list_of_words = list(words_sorted_short['token'])
wordcost = dict((k, log((i+1)*log(len(sorted_list_of_words)))) for i,k in enumerate(sorted_list_of_words))
maxword = max(len(x) for x in sorted_list_of_words)
corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))
for filename in corpus:
content = utilities.readfile(directories['prev'], filename)
text = utilities.strip_punct(content)
tokens = utilities.tokenize_text(text)
replacements = []
for token in tokens:
if not token.lower() in spelling_dictionary:
if len(token) > 17:
if re.search(r"[\-\-\'\"]", token):
pass
else:
split_string = clean.infer_spaces(token, wordcost, maxword)
list_split_string = split_string.split()
if clean.verify_split_string(list_split_string, spelling_dictionary):
replacements.append((token, split_string))
else:
pass
else:
pass
else:
pass
if len(replacements) > 0:
print("{}: {}".format(filename, replacements))
for replacement in replacements:
content = clean.replace_pair(replacement, content)
else:
pass
with open(join(directories['cycle'], filename), mode="w") as o:
o.write(content)
o.close()
LibM19080101-V03-01-page22.txt: [('thefactthattheyinvolvethevitalprinciple', 'the fact that they involve the vital principle')] LibM19080401-V03-02-page22.txt: [('countermemorialists', 'counter memorialists')] LibM19090101-V04-01-page10.txt: [('satisfactostruction', 'sat is fact o st r u c t i o n')] LibM19090701-V04-03-page34.txt: [('ffaSSININSONSIWOloolgasers', 'f f a S S I N I N S O N S I W O l o o l g a s e r s')] LibM19101001-V05-04-page1.txt: [('AffindlitilffilillikVillehd', 'A f f i n d l i t i l f f i l i l l i k V i l l e h d')] LibM19110401-V06-02-page49.txt: [('msossgmAIAASSMgEAMAKAWMALNARLAA', 'ms o s s g m A I A A S S M g E A M A K A W M A L N A R L A A')] LibM19111001-V06-04-page18.txt: [('obviouslyagreement', 'obviously agreement')] LibM19120101-V07-01-page50.txt: [('Toforeigncountries', 'To foreign countries')] LibM19121001-V07-04-page6.txt: [('tomplonsesolomerol', 'tom p l on s e s o l o m e r o l'), ('IMpreeloreesocoeselaal', 'IM p reel ore e s o c o e s e l a a l'), ('emerhilsamalsinalso', 'e m e r h i l s a m a l s i n a l s o'), ('consaalermtooldlny', 'con s a a l er m t o o l d l n y')] LibM19130101-V08-01-page27.txt: [('counterdemonstrations', 'counter demonstrations')] LibM19130101-V08-01-page49.txt: [('agaomooRWairaliOigTiargial', 'a g a o m o o R W a i r a l i O i g T i a r g i a l')] LibM19130401-V08-02-page52.txt: [('Ostammosanosonsorr', 'O st am m o s a n o s o n s o r r')] LibM19130701-V08-03-page10.txt: [('lieutenantgovernor', 'lieutenant governor')] LibM19140101-V09-01-page27.txt: [('establishingreligious', 'establishing religious')] LibM19140101-V09-01-page4.txt: [('infOrfaiSIOMOMINCOMOCADOVIEMMIGOIMIWA', 'in f O r f a i S I O M O M I N C O M O C A D O V I E M M I G O I M I W A')] LibM19140701-V09-03-page49.txt: [('SIMMMISMWKlaiigitil', 'S IM M M I S M W K l a i i g i t i l')] LibM19141001-V09-04-page10.txt: [('counterdenunciations', 'counter denunciations')] LibM19150101-V10-01-page53.txt: [('nosonmomorwemcwaint', 'no son mom or we m c w a i n t')] LibM19150701-V10-03-page18.txt: [('governmentsupported', 'government supported')] LibM19160101-V11-01e-page1.txt: [('sssssssssssssssssssss', 's s s s s s s s s s s s s s s s s s s s s')] LibM19161001-V11-04-page22.txt: [('counterallegations', 'counter allegations')] LibM19170101-V12-01-page13.txt: [('democraticrepublican', 'democratic republican')] LibM19180101-V13-01-page5.txt: [('reconstructionists', 'reconstruction i sts')] LibM19180101-V13-01-page9.txt: [('antiprohibitionists', 'anti prohibitionists')] LibM19180401-V13-02-page3.txt: [('inoomalloISOSSIMIS', 'in o o m a l l o I S O S S I M I S'), ('MIMMOMIOSOMOSOISSOISIOISSOSSOSIVISSOSSICSIIISS', 'MIM MOM I O SO M O S O I S S O I S I O I S S O S S O S I V I S S O S S I C S I I I S S')] LibM19180701-V13-03-page12.txt: [('selfaggrandizement', 'self aggrandizement')] LibM19180701-V13-03-page27.txt: [('antiecclesiastical', 'anti ecclesiastical')] LibM19180701-V13-03-page3.txt: [('inimlfilninninilli', 'in im l f i l n i n n i n i l l i')] LibM19181001-V13-04-page13.txt: [('HIHINHINIIIIIIIRIN', 'HI H IN H IN III III IR IN')] LibM19190101-V15-01-page3.txt: [('iitoitllislossoliiosill', 'ii to it l l is loss o l ii o s ill')] LibM19190401-V15-02-page3.txt: [('MIOMMOOOMOOMSOICIMUCHMUSUOIHIUOIMISIUMMICOSISSISSInasseeememeescs', 'M I O M M O O O M O O M S O I C I M U C H M U S U O I H I U O I M I S I U M M I C O S I S S I S S I n a s s e e e m e m e e s c s')] LibM19190701-V15-03-page36.txt: [('lecosniiionpainOticsovicesfirde', 'le c o s n i i i o n p a i n O t i c s o v i c e s f i r d e')] LibM19191001-V15-04-page27.txt: [('MIilmilliiimilliifilMIDID', 'M I i l m i l l i i i m i l l i i f i l M I D I D')] LibM19200401-V14-02-page13.txt: [('IMEMEMEMEMEINIMEIMII', 'I ME ME ME ME ME IN I ME IM II')] LibM19200401-V14-02-page31.txt: [('RIMMINEIIIIIIIIIMIRRE', 'RIM MIN E I I I I I I I I I M I R R E')] LibM19200701-V14-03-page12.txt: [('INMPAIAVIMMIPAMIPAMMMIXIMP', 'IN M P A I A V I M M I P A M I P A M M M I X I M P'), ('iillrieeiaiiirriardi', 'i ill r i e e i a i i i r r i a r d i')] LibM19200701-V14-03-page20.txt: [('UVRAPSIMISULSWIPAMPIAMPV', 'U V R A P S I M I S U L S W I P A M P I A M P V'), ('weiverreitaararforreahaarivitoroyerriiivii', 'we iv err e i t a a r a r f o r r e a h a a r i v i t o r o y e r r i i i v i i')] LibM19200701-V14-03-page3.txt: [('ipuitnIlinimilliiiinulillluunii', 'i p u i t n I l i n i m i l l i i i i n u l i l l l u u n i i')]
In [40]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/LibM/correction9 Average verified rate: 0.9815728089947997 Average of error rates: 0.0331019809244314 Total token count: 1452039
In [41]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[41]:
[("'", 1499), ('m', 1365), ('d', 1261), ('e', 1023), ('w', 958), ('t', 844), ('n', 797), ('r', 704), ('f', 643), ('g', 391), ('x', 272), ('u', 218), ('k', 195), ('tv', 150), ('th', 117), ('pa', 100), ('sunday-law', 92), ('z', 82), ('ex', 75), ('io', 71), ('id', 71), ('co', 64), ('postmaster-general', 62), ('re', 59), ('ga', 58), ('post-offices', 57), ('mo', 57), ('un-american', 57), ('statute-books', 56), ('va', 56), ('sunday-closing', 54), ('church-and-state', 49), ('mm', 45), ('q', 44), ('un', 43), ('mt', 42), ('attorney-general', 41), ('tion', 40), ('sunday-rest', 39), ('wm', 38), ('pp', 38), ('charta', 37), ('ro', 35), ('li', 35), ('neander', 31), ('-', 30), ('seventhday', 30), ('mi', 28), ('ky', 28), ('religio-political', 27)]
In [42]:
reports.docs_with_high_error_rate(summary)
Out[42]:
[('LibM19200401-V14-02-page4.txt', 1.0), ('LibM19060401-V01-01-page2.txt', 1.0), ('LibM19140701-V09-03-page52.txt', 0.857), ('LibM19110701-V06-03-page1.txt', 0.824), ('LibM19080101-V03-01-page1.txt', 0.812), ('LibM19090401-V04-02-page33.txt', 0.778), ('LibM19110101-V06-01-page1.txt', 0.769), ('LibM19191001-V15-04-page28.txt', 0.75), ('LibM19110701-V06-03-page4.txt', 0.729), ('LibM19080401-V03-02-page19.txt', 0.714), ('LibM19080701-V03-03-page1.txt', 0.687), ('LibM19100101-V05-01-page1.txt', 0.676), ('LibM19111001-V06-04-page1.txt', 0.667), ('LibM19090401-V04-02-page1.txt', 0.667), ('LibM19110401-V06-02-page1.txt', 0.662), ('LibM19081001-V03-04-page1.txt', 0.66), ('LibM19080401-V03-02-page1.txt', 0.659), ('LibM19130701-V08-03-page2.txt', 0.657), ('LibM19100401-V05-02-page1.txt', 0.645), ('LibM19080701-V03-03-page41.txt', 0.619), ('LibM19100701-V05-03-page1.txt', 0.615), ('LibM19140401-V09-02-page1.txt', 0.611), ('LibM19120701-V07-03-page4.txt', 0.605), ('LibM19090401-V04-02-page2.txt', 0.6), ('LibM19170401-V12-02-page1.txt', 0.583), ('LibM19150401-V10-02-page1.txt', 0.577), ('LibM19170701-V12-03-page1.txt', 0.566), ('LibM19180101-V13-01-page4.txt', 0.558), ('LibM19090101-V04-01-page1.txt', 0.543), ('LibM19060401-V01-01-page35.txt', 0.529), ('LibM19191001-V15-04-page1.txt', 0.524), ('LibM19170701-V12-03-page4.txt', 0.5), ('LibM19120401-V07-02-page4.txt', 0.5), ('LibM19101001-V05-04-page1.txt', 0.478), ('LibM19180101-V13-01-page1.txt', 0.471), ('LibM19121001-V07-04-page1.txt', 0.471), ('LibM19190701-V15-03-page36.txt', 0.456), ('LibM19200101-V14-01-page1.txt', 0.455), ('LibM19121001-V07-04-page6.txt', 0.45), ('LibM19070401-V02-02-page36.txt', 0.444), ('LibM19090701-V04-03-page1.txt', 0.419), ('LibM19111001-V06-04-page52.txt', 0.407), ('LibM19151001-V10-04-page7.txt', 0.406), ('LibM19190401-V15-02-page1.txt', 0.4), ('LibM19200401-V14-02-page35.txt', 0.393), ('LibM19071001-V02-04-page18.txt', 0.389), ('LibM19130701-V08-03-page26.txt', 0.387), ('LibM19140101-V09-01-page56.txt', 0.383), ('LibM19141001-V09-04-page52.txt', 0.363), ('LibM19071001-V02-04-page51.txt', 0.36), ('LibM19140701-V09-03-page1.txt', 0.333), ('LibM19200701-V14-03-page1.txt', 0.333), ('LibM19080101-V03-01-page2.txt', 0.333), ('LibM19201001-V14-04-page1.txt', 0.333), ('LibM19120401-V07-02-page40.txt', 0.32), ('LibM19090401-V04-02-page51.txt', 0.312), ('LibM19160701-V11-03-page1.txt', 0.312), ('LibM19140701-V09-03-page4.txt', 0.307), ('LibM19090701-V04-03-page51.txt', 0.294), ('LibM19180701-V13-03-page4.txt', 0.294), ('LibM19140701-V09-03-page49.txt', 0.288), ('LibM19150101-V10-01-page1.txt', 0.286), ('LibM19190101-V15-01-page4.txt', 0.273), ('LibM19130701-V08-03-page27.txt', 0.273), ('LibM19090101-V04-01-page21.txt', 0.262), ('LibM19060401-V01-01-page36.txt', 0.25), ('LibM19200401-V14-02-page1.txt', 0.25), ('LibM19120401-V07-02-page38.txt', 0.245), ('LibM19151001-V10-04-page1.txt', 0.24), ('LibM19090101-V04-01-page52.txt', 0.239), ('LibM19130401-V08-02-page1.txt', 0.235), ('LibM19121001-V07-04-page4.txt', 0.233), ('LibM19120701-V07-03-page1.txt', 0.231), ('LibM19141001-V09-04-page1.txt', 0.222), ('LibM19070401-V02-02-page35.txt', 0.222), ('LibM19130101-V08-01-page1.txt', 0.214), ('LibM19170401-V12-02-page4.txt', 0.214), ('LibM19131001-V08-04-page1.txt', 0.211), ('LibM19080701-V03-03-page52.txt', 0.21), ('LibM19080101-V03-01-page51.txt', 0.208), ('LibM19130701-V08-03-page1.txt', 0.208), ('LibM19100101-V05-01-page31.txt', 0.206), ('LibM19090701-V04-03-page42.txt', 0.206), ('LibM19150101-V10-01-page12.txt', 0.202)]
In [43]:
# %load shared_elements/high_error_rates.py
doc_keys = [x[0] for x in reports.docs_with_high_error_rate(summary) if x[1] > 0.5]
# utilities.open_original_docs(doc_keys, directories['cycle'])
In [ ]: