WHM-OCR-Evaluation-and-Correction
In [1]:
%load_ext autoreload
In [2]:
%autoreload 2
In [3]:
from text2topics import reports
from text2topics import utilities
from text2topics import clean
import re
import os
from os import listdir
from os.path import isfile, join
import collections
In [4]:
%matplotlib inline
In [5]:
wordlist_dir = "/Users/jeriwieringa/Dissertation/drafts/data/word-lists"
wordlists = ["2016-12-07-SDA-last-names.txt",
"2016-12-07-SDA-place-names.txt",
"2016-12-08-SDA-Vocabulary.txt",
"2017-01-03-place-names.txt",
"2017-02-14-Base-Word-List-SCOWL&KJV.txt",
"2017-02-14-Roman-Numerals.txt",
"2017-03-01-Additional-Approved-Words.txt"
]
In [6]:
spelling_dictionary = utilities.create_spelling_dictionary(wordlist_dir, wordlists)
In [7]:
title = "WMH"
In [8]:
base_dir = "/Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/{}/".format(title)
Baseline¶
In [9]:
cycle = 'baseline'
In [10]:
stats = reports.overview_report(join(base_dir, cycle), spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/WMH/baseline Average verified rate: 0.9316708765632767 Average of error rates: 0.07061265580057527 Total token count: 939453
In [11]:
errors_summary = reports.get_errors_summary( stats )
reports.top_errors( errors_summary, 500 )
Out[11]:
[('-', 1687), ('m', 1646), ('w', 1492), ('g', 1421), ('d', 1246), ('e', 1229), ('¥', 881), ('re-', 816), ('con-', 748), ('tion', 679), ('r', 664), ('n', 633), ('in-', 539)]
Review Special Character Use¶
In [12]:
reports.tokens_with_special_characters(errors_summary)
Out[12]:
[('¥', 881), (')', 432), ('(', 368), ('ñ', 248), ('/', 171), ('ñthe', 100), ('_', 81), ('presidentña', 75), ('numbess)in', 70), ('¡', 65), ('educationñprof', 62), ('numbers)in', 62), ('%', 62), ('treasurerñe', 61), ('secretaryñm', 61), ('ñmargaret', 58), ('(to', 52), ('\\', 41), ('presidentñs', 38), ('ñelder', 37), ('treasñjennie', 35), ('ñselected', 35), ('presidentñm', 32), ('ña', 32), ('*', 31), ('//', 25), ('numbers)', 25), ('ã', 24), ('ñwe', 23), ('¢', 22), ('ô', 21), ('(the', 20), ('ñi', 20), ('numbeps)in', 19), ('numbees)in', 17), ('(b)', 16), ('ñthat', 16), ('(concluded', 16), ('¥the', 16), ('ñdied', 16), ('secretaryñs', 15), ('(a)', 15), ('ñhattie', 15), ('treasurerñd', 15), ('ñbrother', 13), ('(and', 13), ('ñand', 12), ('i)', 12), ('(ps', 12), ('(see', 12), ('(c)', 11), ('ñin', 11), ('(tithe)', 11), ('in¥', 11), ('ñmiss', 11), ('(john', 10), ('¥of', 9), ('quartetñ', 9), ('ñno', 9), ('ñdr', 8), ('ñfrom', 8), ('ñmrs', 8), ('ñas', 8), ('///', 8), ('•', 8), ('songñ', 8), ('(rev', 8), ('numbevs)in', 8), ('(heb', 7), ('numbews)in', 7), ('ñour', 7), ('(not', 7), ('numbeas)in', 7), ('(a', 7), ('continued)', 7), ('`', 7), ('ñw', 7), ('\\vest', 7), ('(isa', 6), ('\\\\', 6), ('nña', 6), ('ñthis', 6), ('ñan', 6), ('ñat', 6), ('ñsuccess', 6), ('(acts', 6), ('(d)', 6), ('ñit', 6), ('ñs', 6), ('(continued', 6), ('¦', 6), ('ó', 6), (']', 6), ('(matt', 5), ('given)', 5), ('(sunday)', 5), ('ñone', 5), ('ñministry', 5), ('homeñthe', 5), ('(job', 5), ('sabbath¥school', 5), ('ññ', 5), ('(field', 5), ('ñnot', 5), ('(mrs', 5), ('¥we', 5), ('ñmen', 5), ('¥¥', 5), ('=', 5), ('educationña', 5), ('ñall', 5), ('michã', 5), ('()reek', 5), ('numbess)ln', 5), ('the¥', 5), ('ñprof', 5), ('ñella', 5), ('(i)', 4), ('(or', 4), ('ñthey', 4), ('ñsome', 4), ('ñgospel', 4), ('ñsister', 4), ('\ufeff', 4), ('\\\\\\\\', 4), ('(ex', 4), ('\\v', 4), ('numbcps)in', 4), ('ñj', 4), ('[', 4), ('wantedña', 4), ('ñfor', 4), ('i/', 4), ('(we', 4), ('numbecs)in', 4), ('(paper)', 4), ('ãã', 4), ('numbens)in', 4), ('(which', 4), ('the_', 4), ('ñm', 4), ('(luke', 4), ('¥in', 4), ('(i', 4), ('¥¥¥', 4), ('ñeld', 4), ('ñrev', 3), ('(this', 3), ('purposeñto', 3), ('/-', 3), ('(there', 3), ('(read', 3), ('he¥', 3), ('#', 3), ('be¥', 3), ('ñgeorge', 3), ('ob¥', 3), ('io¢', 3), ('(deut', 3), ('and¥', 3), ('[john', 3), ('(g)', 3), ('termñbible', 3), ('ñremember', 3), ('(so', 3), ('hymnñ', 3), ('ñtestimonies', 3), ('(cloth', 3), ('(f)', 3), ('(mal', 3), ('ñof', 3), ('¥to', 3), ('¥do', 3), ('(margin', 3), ('(in', 3), ('ñsimply', 3), ('(e)', 3), ('to¥', 3), ('saleña', 3), ('~', 3), ('¥and', 3), ('conference(tithe)', 3), ('¥been', 3), ('°', 3), ('tion)', 3), ('¡-', 3), ('(he', 3), ('o%', 3), ('ç', 3), ('ñlast', 3), ('ñyes', 3), ('}', 3), ('ñif', 3), ('ñis', 3), ('`great', 3), ('allñthe', 3), ('michiganñ', 3), ('ñmarianne', 3), ('byñ', 3), ('`the', 3), ('ñbible', 3), ('ñto', 3), ('(prov', 3), ('~~', 3), ('/i', 3), ('`object', 3), ('not¥', 3), ('(for', 3), ('christñ', 2), ('each)', 2), ('was¥', 2), ('¥a', 2), ('¥-', 2), ('igo+', 2), ('educationñits', 2), ('numbests)in', 2), ('i~n-', 2), ('with¥', 2), ('comfortñ', 2), ('/(', 2), ('can_', 2), ('*read', 2), ('(july', 2), ('¥ñ', 2), ('reading)', 2), ('ñh', 2), ('ñwhether', 2), ('ñc', 2), ('restñ', 2), ('/e', 2), ('ñthere', 2), ('%%', 2), ('(dt', 2), ('ñso', 2), ('ñf', 2), ('(new', 2), ('(as', 2), (')))', 2), ('((that', 2), ('young*', 2), ('these¥', 2), ('(christ)', 2), ('carñoh', 2), ('ñeven', 2), ('_the', 2), ('work¥', 2), ('ñread', 2), ('ex¥', 2), ('wig)', 2), ('workñnot', 2), ('ñwith', 2), ('(vs', 2), ('(without', 2), ('[should]', 2), ('¥who', 2), ('on¥', 2), ('them)', 2), ('-ô', 2), ('ñhealth', 2), ('ñever', 2), ('grammarñcomplete', 2), ('usedñthe', 2), ('+', 2), ('to¢', 2), ('ant)', 2), (')ñ', 2), ('back)', 2), ('[for', 2), ('>', 2), ('christ)', 2), ('this¥', 2), ('ñjames', 2), ('beñ', 2), ('(vol', 2), ("'¥", 2), ('church)', 2), ('ñhad', 2), ('(h)', 2), ('ñtwo', 2), ('ñare', 2), ('guidanceñ', 2), ("['sego", 2), ('paperñduties', 2), ('foodñ', 2), ('/a', 2), ('o¢', 2), ('ñreports', 2), ('r/', 2), ('\\k', 2), ('(april', 2), ('tions)', 2), ('ñwhen', 2), ('soloñ', 2), ('n¢', 2), ('quartetteñ', 2), ('childñhis', 2), ('ñfebruary', 2), ('in*', 2), ('(verse', 2), ('¥for', 2), ("///'", 2), ('numbers)ln', 2), ('ñsel', 2), ('camp¥meeting', 2), ('lord)', 2), ('ñseveral', 2), ('%two', 2), ('-¥', 2), ('drinkñ', 2), ('eternityñ', 2), ('¥c', 2), ('(those', 2), ('ñherrick', 2), ('sec¥', 2), ('fearsñhe', 2), ('(concluded)', 2), ('ñu', 2), ('ñsir', 2), ('(front', 2), ('subscriptions)', 2), ('page)', 2), ("'/", 2), ('father)', 2), ('greek]', 2), ('sabbath¥', 2), ('(iii', 2), ('franciscoñfell', 2), ('ñmembers', 2), ('(nov', 2), ('(isaiah', 2), ('(all', 2), ('(minister)', 2), ('they_', 2), ('smileñ', 2), ('ñjohn', 2), ('itñ', 2), ('his¥', 2), ('/#', 2), ('ñwill', 2), ('`we', 2), ('ñlittle', 2), ('presidentñ', 2), ('(-', 2), (')))))', 2), ('ñd', 2), ('camp¥', 2), ('it¥', 2), ('ñsabbath', 2), ('_in', 2), ('trueñ', 2), ('(money', 2), ('ñwas', 2), ('saleñforty-acre', 2), ('%v', 2), ('(paper', 2), ('bibleñold', 2), ('ñby', 2), ('(poetry', 2), ('ñprofessor', 2), (')(', 2), ('(even', 2), ('ñbut', 2), ('god)', 2), ('(ga', 2), ('(note', 2), ('(ecc', 2), ('paperñhow', 2), ('to¥show', 2), ('¥they', 2), ('(swedish)', 2), ('is¥', 2), ('to-day)', 2), ('ñthose', 2), ('paperñthe', 2), ('prayerñ', 2), ('♦', 2), ('saysñ', 2), ('(jno', 2), ('bookñthe', 2), ('¥be', 2), ('ñwhat', 2), ('re¥', 2), ('(life', 2), ('__', 2), ('(col', 2), ('_this', 2), ('heartñ', 2), ("¥'", 2), ('numbers)i', 2), ('[tight', 2), ('joyñ', 2), ('more¥', 2), ('ñabraham', 2), ('(psalms', 2), ('a¥', 2), ('(with', 2), ('`it', 2), ('ñwhich', 2), ('and_', 2), ('anythingñ', 2), ('is_', 2), ('that¥', 2), ('ñ-', 2), ('_have', 2), ('ñcamp-meetings', 2), ("(god's)", 1), ('`ye', 1), ('¥ence', 1), ('health]', 1), ('_lessons', 1), ('ñcollege', 1), ('countryñmussoorie', 1), ('ñhistorical', 1), ('(tile', 1), ('margaret¥ilaughey', 1), ('(twins', 1), ('gui)', 1), ('criti¥', 1), ('ques¥', 1), ('heartsñto', 1), ("'ô\\", 1), ('\\\\e', 1), ('_materials', 1), ('twoñgeneral', 1), ('[or', 1), ('to¥your', 1), ('burnhamñallegan', 1), ('baffledñdestroyed', 1), ('a\\mir', 1), ('was]', 1), ('`prepare', 1), ('(bishop)', 1), ('=pill/irk', 1), ('deredñ', 1), ('r)r', 1), ('//ii/', 1), ('before)', 1), ('michôan', 1), ('^', 1), ('(actions', 1), ('whileñis', 1), ('purposeñabraham', 1), ('(adopted', 1), ('%%us', 1), ('numbers)in-advance', 1), ('mentionedñconducting', 1), ('¥=', 1), ('_here', 1), ('and¥sisters', 1), ('every-__', 1), ('(margin)', 1), ('previous¥', 1), ('veas(', 1), ('possible¡', 1), ('illgami/', 1), ('causeñan', 1), ('(german', 1), ('ii)', 1), ('of¥', 1), ('stateñsome', 1), ('merriamñlowell', 1), ('(adv', 1), ('ined)', 1), ('shawñdied', 1), ('be=', 1), ('lettersñthe', 1), ('adam¥transgressed', 1), ('(forty', 1), ('wic/', 1), ('\\j', 1), ('the`lord', 1), ('out¥', 1), ('ñeugene', 1), ('pm/', 1), ('trioñ', 1), ('m(', 1), ('deposits)', 1), ('ñordis', 1), (')}', 1), ('¨f', 1), ('center¥ñ', 1), ('convenñ', 1), ('scherzoñ', 1), ('exerciseñthe', 1), ('peo¥', 1), ('says)', 1), ('cx)', 1), ('or¥', 1), ('ñ=', 1), ('trainingñthe', 1), ('et*', 1), ("stringsñsailor'", 1), ('¥edward', 1), ('drillsñreading', 1), ('ñyouth', 1), ('(about', 1), ('v/', 1), ('_consideration', 1), ('lord¥', 1), ('atedña', 1), ('ñought', 1), ('mornñso', 1), ('ñoh', 1), ('prayers_', 1), ('mer_', 1), ('godñsome', 1), ('p/a', 1), ('dieñas', 1), ('%ell', 1), ('statementñ', 1), ('me)', 1), ('publishing_', 1), ("worldã'¥", 1), ("'ñand", 1), ('bodyña', 1), ('(that', 1), ('praise¥god', 1), ('wilburñportland', 1), ('preparationñits', 1), ('(ii', 1), ('meansñmen', 1), ("(/'", 1), ('ãg', 1), ('(vest', 1), ('compassionñ', 1), ('_lumber', 1), ('ñon', 1), ('/efr/', 1), ('willing¥', 1), ('ñeben', 1), ('thoughtñdivine', 1), ('pro¥', 1), ('ñworld', 1), ('necessaryñ', 1), ('exercisedñthe', 1), ('(excluding', 1), ('not¥in', 1), ('yardñwinifred', 1), ('oc)', 1), ("wr'%ô", 1), ('baptizedñthis', 1), ('the¥formation', 1), ('zw/i/', 1), ('arithmeticñcomplete', 1), ('overlookedñthe', 1), ('heartñgrowing', 1), ('ver)', 1), ('artñall', 1), ('each¥way', 1), ('bibleñchurch', 1), ('wordsñand', 1), ('some)', 1), ('morningñwhen', 1), ('accomplishñ', 1), ('tionñfurnishes', 1), ('s\\', 1), ('yô', 1), ('it/', 1), ('known)', 1), ('ñsabbath-', 1), ('¥from', 1), ('wa¤', 1), ('tory)', 1), ('downñthe', 1), ('ñfell', 1), ('valie¥', 1), ('a%\\', 1), ('cottñan', 1), ('a*', 1), ('standsñis', 1), ('friendñ', 1), ('ñper-', 1), ('goñall', 1), ('years)', 1), ('nersñand', 1), ('lith**', 1), ('ñarticles', 1), ("ô'd", 1), ('baptistñmillie', 1), ('membersñtwo', 1), ('ôi', 1), ('distantña', 1), ('stormôso', 1), ('lostñat', 1), ('(ger-', 1), ('trial¥', 1), ('pesveas(', 1), ('made¥', 1), ('%or', 1), ('here¥and', 1), ('numbcps)', 1), ('(absolute)', 1), ("ã'", 1), ('*have', 1), ('(mar-', 1), ('(d', 1), ('beastñthe', 1), ('(broth-', 1), ('inheritedñ', 1), ('a)', 1), ('teachersñ(', 1), ('ñforty', 1), ('((armed¥', 1), ('(virginia)', 1), ('pierceñmrs', 1), ('all¥the', 1), ('[not', 1), ('firga/', 1), ('emptyñcontribute', 1), ('thingñonly', 1), ('(log', 1), ('infancy)', 1), ('_---', 1), ("¥'we", 1), ('holding_', 1), ('lôilorning', 1), ('mountainsñwas', 1), ('institu¥', 1), ('then_', 1), ('ñtwenty-two', 1), ('does¥', 1), ('wrong¥', 1), ('meñif', 1), ('primaryãand', 1), ('èè', 1), ('\\varner', 1), ('rhetoricñkellogg', 1), ('ninthñnever', 1), ('translation)', 1), ('ñatlantic', 1), ('when_', 1), ("'illl~l", 1), ('ãmin', 1), ('itumegoc(', 1), ('¥incomparable', 1), ('appoint=', 1), ('ãli', 1), ('secondñdrink', 1), ('feelñwell', 1), ('comeñlet', 1), ('ñeating', 1), ('_sister', 1), ('iff(iii', 1), ('¥life', 1), ("'¥'", 1), ('pesyeas(', 1), ('withrowñdied', 1), ('recitationñ', 1), ('ñsaving', 1), ('(retail', 1), ('fit*takki', 1), ("curse')", 1), ('diedñin', 1), ('ñr', 1), ('(ise', 1), ('jam(', 1), ('egypt)', 1), ('a/', 1), ('(ind', 1), ('placesñthirty-three', 1), ("botanyñleavitt's", 1), ('ringsñby', 1), ('continued¥)', 1), ('z/g¥', 1), ('(board', 1), ('vs¥m', 1), ('godña', 1), ('cal*', 1), ('ñbrethren', 1), ('scaledñgod', 1), ('(denomi-', 1), ('(africa)', 1), ('ñhalf', 1), ('t*', 1), ('ammo(', 1), ('of`the', 1), ('property_', 1), ('use¥of', 1), ('nexus¥', 1), ('new¥', 1), ('(apostolic', 1), ('(march', 1), ('tistiofflau_j', 1), ('¥ations', 1), ('ñliquor', 1), ('(no', 1), ('minutes)', 1), ('crafts)', 1), ('ñstephen', 1), ('awayñtheir', 1), ('fieldñ', 1), ('ñwell', 1), ('ñor', 1), ('christñwho', 1), ('workñbeing', 1), ('paperñhave', 1), ('ñhe', 1), ('%%mo', 1), ('ñmain-', 1), ('(saturday)', 1), ('(danish-', 1), ('prophetically)', 1), ('deliver¥', 1), ('çflaiii', 1), ('\\\\ittuto', 1), ('¥ten', 1), ('edu¥', 1), ('riversñfifteen', 1), ('haugheyñotsego', 1), ('knowñi', 1), ('numbems)in', 1), ('/tioheagigt', 1), ('flowñ', 1), ('ñready', 1), ('~niiii', 1), ('*you', 1), ('letterñfrom', 1), ('jo¡', 1), ('corm)', 1), ('land)', 1), ('we¥are', 1), ('ñgeneral', 1), ('*two', 1), ('(should', 1), ("\\ctrir''", 1), ('tenthñ', 1), ('areña', 1), ('loi#d', 1), ('(under', 1), ('(especially', 1), ('two¥', 1), ('meri¥', 1), ('¥corliss', 1), ('are¥', 1), ('ñtogether', 1), ('thousand¥', 1), ('%vest', 1), ('t-}', 1), ('sorrowñ', 1), ('aboveñcause', 1), ('butterfieldñbuchanan', 1), ('spearñfell', 1), ('appe¥', 1), ('christñhe', 1), ('(except', 1), ('agesñ', 1), ('w//', 1), ('`m~d', 1), ('[the', 1), ('bath¥keepers', 1), ('¡heaven', 1), ('ñtuesday', 1), ('distanceñthe', 1), ('¥kalama', 1), ('c)', 1), ('(v', 1), ('_read', 1), ('tentsñone', 1), ('_e__zeo', 1), ('shriekñ', 1), ('ñenough', 1), ('tentsñthe', 1), ('(they', 1), ('homeñ', 1), ('fb/', 1), ('copyñthe', 1), ('smithñgrandville', 1), ('plifitt/', 1), ('satan¥', 1), ('departmentñtwo', 1), ('bandñ', 1), ('ñtemporal', 1), ('grandville_', 1), ('privilege/to', 1), ('faultsñshould', 1), ('answerñ', 1), ('(whatsoever', 1), ('body)', 1), ('[c]', 1), ('ac}', 1), ('¥usñthe', 1), ('portunity¥', 1), ('o/', 1), ('ñreasons', 1), ('r¢', 1), ('year(', 1), ('some`consideration', 1), ('ñmeetings', 1), ('()rues¡', 1), ('il/', 1), ('standpointña', 1), ('ñhand', 1), ('unionñfrank', 1), ('¡()', 1), ('wantedñto', 1), ("'%\\%", 1), ('scienceñelementary', 1), ('orphanñit', 1), ('¥planting', 1), ('(bo', 1), ('ñspeaking', 1), ('found¥the', 1), ('a/pfi', 1), ('¥ed', 1), ('`\\_\\_', 1), ('(two', 1), ('meet=', 1), ('\\\\ô\\', 1), ('aon¥', 1), ('%moo', 1), ('areñthe', 1), ('a(a', 1), ('thatñ', 1), ("under'compulsion)", 1), ('-_', 1), ('ãt', 1), ('ho\\tever', 1), ('hillginc/', 1), ('discus-¥', 1), (')im', 1), ('(his', 1), ('go%', 1), ('ingsña', 1), ('pennsyl-(', 1), ('*heaven', 1), ("tm'\\", 1), ('reveals¥', 1), ('vvr/rip', 1), ('worldñthey', 1), ('*out', 1), ('(forces)', 1), ('laterñ', 1), ('is`situated', 1), ('graceñlove', 1), ('¥rela-', 1), ('ñmay', 1), ('(illus', 1), ('ñlet', 1), ("'wm*", 1), ('at¥corn', 1), ('itñthat', 1), ('up¥and', 1), ('holidayñdied', 1), ('yearñ', 1), ('shoulc_lopot', 1), ('there¥', 1), ('oneñto', 1), ('(symbolically', 1), ('prospectñwe', 1), ('turn_pale', 1), ("ãa'", 1), ('g¥', 1), ('(pest', 1), ('questionñis', 1), ("'ñone", 1), ('/inj', 1), ('_effect', 1), ('society(', 1), ('collardñdied', 1), ('ft/', 1), ('¥secretary', 1), ("\\n\\'", 1), ('(twelve', 1), ('bornñon', 1), ('may¥', 1), ('ñrepairs', 1), ('fãigr', 1), ("~iqiiiidiiiniinii(i'''", 1), ("/'i", 1), ('fridayñprepared', 1), ('_-', 1), ('governorñduties', 1), ('extras)', 1), ('history¥', 1), ('ere/', 1), ('*is%', 1), ('often*', 1), ('cudneyñdied', 1), ('(they)', 1), ('youñyou', 1), ('con_erning', 1), ('classesñintermediate', 1), ('roil/', 1), ('ii¥', 1), ('of/', 1), ('ñeach', 1), ("ñ'", 1), ('\\yam', 1), ('_apply', 1), ('`illessed', 1), ('*and', 1), ('(ilatchman', 1), ('resum¥', 1), ('ñlegislative', 1), ('peap(', 1), ('<', 1), ('/cartoinmtza', 1), ('fie/l(', 1), ('messageñto', 1), (')he', 1), ('_god', 1), ('smithyñlesson', 1), ('whiteñthe', 1), ('re/', 1), ('sugar)', 1), ("('", 1), ('ñhave', 1), ('_much', 1), ('\\ô\\', 1), ('c_aivy', 1), ('for_room', 1), ('largeñone', 1), ('*licentiates', 1), ('influenceñin', 1), ('sutherland)', 1), ('k/aw', 1), ('ñdo', 1), ('ããilicom', 1), ('treasurerñ', 1), ('eoñternal', 1), ('tells_', 1), ('jill)', 1), ('before¥', 1), ('¥v', 1), ('`cast', 1), ('wedgeñkindness', 1), ('s¥ix', 1), ('illl~', 1), ('//mi', 1), ('kind)', 1), ('bless¥', 1), ("/g'", 1), ('poundñhath', 1), ('general_', 1), ('israelñ', 1), ('-•', 1), ('(such', 1), ('ureclly_il', 1), ('affliction_', 1), ('thisñthere', 1), ('ñnow', 1), ('solveñ', 1), ('*in', 1), ('house)', 1), ('ñduring', 1), ('freedomñ', 1), ('g\\ta', 1), ('¥bible', 1), ('other_', 1), ('(t', 1), ('aroundñ', 1), ('sundayñin', 1), ('athafitov/iati', 1), ('produce_a', 1), ('the¥thanksgiving', 1), ('/z//', 1), ('(story)', 1), ('myself)', 1), ('an¥', 1), ('*conference', 1), ('after¥', 1), ('ñsigns', 1), ('mcmorran*', 1), ('¥training', 1), ('ñen', 1), ('(george', 1), ('ordersñin', 1), ('seventhñbe', 1), ('edñdr', 1), ('ñ`a', 1), ('%ago', 1), ('vegetarianismñits', 1), ('conventionsñin', 1), ('%*', 1), ('(lowelf)', 1), ('_someone', 1), ('(let', 1), ('very¥', 1), ('siteaga_', 1), ('`value', 1), ('ñdwelling', 1), ('ñhattiee', 1), ('speakñmen', 1), ('pleasures)', 1), ('ò', 1), ('snunbers)in', 1), ('crossñ', 1), ('ñthese', 1), ('ro%', 1), ('mel/', 1), ('ñsunday-closing', 1), ('cheapñrubber-tired', 1), ('`permit', 1), ('(church', 1), ('stateñpublishes', 1), ("salvation'of¥", 1), ('bodiesñlet', 1), ('my_', 1), ('yeas(', 1), ("volun¥teers'", 1), ...]
Correction 1 -- Normalize Characters¶
In [14]:
# %load shared_elements/normalize_characters.py
prev = "baseline"
cycle = "correction1"
directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
os.makedirs(directories['cycle'])
corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))
for filename in corpus:
content = utilities.readfile(directories['prev'], filename)
# Substitute for all other dashes
content = re.sub(r"—-—–‑", r"-", content)
# Substitute formatted apostrophe
content = re.sub(r"\’\’\‘\'\‛\´", r"'", content)
# Replace all special characters with a space (as these tend to occur at the end of lines)
content = re.sub(r"[^a-zA-Z0-9\s,.!?$:;\-&\'\"]", r" ", content)
with open(join(directories['cycle'], filename), mode="w") as o:
o.write(content)
o.close()
In [17]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/WMH/correction1 Average verified rate: 0.9377967276021958 Average of error rates: 0.0643058485139022 Total token count: 938150
In [18]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[18]:
[('m', 1749), ('-', 1720), ('w', 1503), ('g', 1434), ('e', 1307), ('d', 1276), ('re-', 816), ('con-', 750), ('tion', 684), ('r', 681), ('n', 644), ('in-', 539), ("'", 507), ('be-', 471), ('f', 444), ('t', 381), ('de-', 377), ('com-', 339), ('ex-', 332), ('michi-', 328), ('th', 289), ('sab-', 285), ('ment', 283), ('ence', 267), ('en-', 233), ('peo-', 226), ('sabbath-', 223), ('ly', 220), ('ference', 212), ('ple', 207), ('confer-', 207), ('pre-', 203), ('tions', 189), ('ad-', 186), ('dis-', 178), ('at-', 173), ('oo', 172), ('im-', 167), ('mis-', 164), ('un-', 163), ('meet-', 162), ('ers', 162), ('ac-', 161), ('pro-', 153), ('per-', 146), ('ber', 137), ('io', 117), ('ap-', 116), ('ren', 114), ('ary', 113)]
Correction 2 -- Connect Line Endings¶
In [20]:
# %load shared_elements/correct_line_endings.py
prev = cycle
cycle = "correction2"
directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
os.makedirs(directories['cycle'])
corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))
for filename in corpus:
content = utilities.readfile(directories['prev'], filename)
content = re.sub(r"(\w+)(\-\s{1,})([a-z]+)", r"\1\3", content)
with open(join(directories['cycle'], filename), mode="w") as o:
o.write(content)
o.close()
In [23]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/WMH/correction2 Average verified rate: 0.9726863553068523 Average of error rates: 0.029485139022051778 Total token count: 915147
In [24]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[24]:
[('m', 1747), ('-', 1700), ('w', 1503), ('g', 1434), ('e', 1307), ('d', 1275), ('r', 680), ('n', 644), ("'", 507), ('f', 443), ('t', 377), ('th', 283), ('oo', 171), ('sabbathschool', 163), ('io', 117), ('mt', 108), ('k', 106), ('co', 102), ('ro', 94), ('wm', 82), ('numbess', 75), ('u', 69), ("'field", 67), ("canvassers'", 58), ('--', 50), ('x', 46), ("'the", 44), ('horr', 39), ("the'", 38), ('rd', 33), ('blendon', 32), ('mid-summer', 32), ('brower', 31), ("f'd", 30), ('-the', 29), ('harnden', 29), ('mchugh', 29), ('nd', 28), ('seventhday', 28), ('cleora', 27), ('ex', 26), ('tion', 25), ('sabbathschools', 23), ('q', 23), ('nunica', 23), ('con-', 22), ("'to", 22), ('vowyla', 21), ('-and', 21), ('loth', 20)]
Correction 3 -- Remove extra dashes¶
In [26]:
# %load shared_elements/remove_extra_dashes.py
prev = cycle
cycle = "correction3"
directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
os.makedirs(directories['cycle'])
corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))
for filename in corpus:
content = utilities.readfile(directories['prev'], filename)
text = re.sub(r"[0-9,!?$:;&]", " ", content)
tokens = utilities.tokenize_text(text)
replacements = []
for token in tokens:
if token[0] is "-":
replacements.append((token, token[1:]))
elif token[-1] is "-":
replacements.append((token, token[:-1]))
else:
pass
if len(replacements) > 0:
print("{}: {}".format(filename, replacements))
for replacement in replacements:
content = clean.replace_pair(replacement, content)
else:
pass
with open(join(directories['cycle'], filename), mode="w") as o:
o.write(content)
o.close()
WMH19030128-V01-04-page1.txt: [('-', ''), ('-', ''), ('-', ''), ('labor-', 'labor'), ('-', '')] WMH19030128-V01-04-page2.txt: [('-Prpartittrrit.', 'Prpartittrrit.')] WMH19030128-V01-04-page3.txt: [('-', ''), ('HER-', 'HER')] WMH19030128-V01-04-page4.txt: [('-', ''), ('Mich-', 'Mich')] WMH19030204-V01-05-page1.txt: [('-', ''), ('fin-', 'fin'), ('-', ''), ('-', '')] WMH19030204-V01-05-page2.txt: [('purit-', 'purit')] WMH19030211-V01-06-page1.txt: [('-Is', 'Is'), ('-', ''), ('-', '')] WMH19030211-V01-06-page3.txt: [('partic-', 'partic')] WMH19030311-V01-10-page2.txt: [('ambush-', 'ambush')] WMH19030311-V01-10-page3.txt: [('morn-', 'morn')] WMH19030311-V01-10-page4.txt: [('en-', 'en'), ('TRAVIS.-', 'TRAVIS.'), ('WILBUR.-', 'WILBUR.')] WMH19030415-V01-15-page2.txt: [('IMPRES-', 'IMPRES'), ('corn-', 'corn')] WMH19030415-V01-15-page3.txt: [('-', ''), ('-air', 'air')] WMH19030415-V01-15-page4.txt: [('HER-', 'HER')] WMH19030506-V01-18-page1.txt: [('COM-', 'COM')] WMH19030506-V01-18-page2.txt: [('cul-', 'cul')] WMH19030506-V01-18-page4.txt: [('-', ''), ('meet-', 'meet'), ('Les-', 'Les'), ('conver-', 'conver'), ('forgive-', 'forgive'), ('sub-', 'sub'), ('HER-', 'HER')] WMH19030513-V01-19-page2.txt: [('-formidable', 'formidable')] WMH19030520-V01-20-page1.txt: [('-', '')] WMH19030520-V01-20-page2.txt: [('-Drpartment', 'Drpartment'), ('--No.', '-No.')] WMH19030520-V01-20-page3.txt: [('temperature-', 'temperature'), ('-', ''), ('temperature-', 'temperature'), ('-', ''), ('applications-', 'applications')] WMH19030520-V01-20-page4.txt: [('Sand-', 'Sand')] WMH19030527-V01-21-page1.txt: [('-', ''), ('-', ''), ('Heb-', 'Heb'), ('with-', 'with')] WMH19030527-V01-21-page2.txt: [('--such', '-such'), ('faith-', 'faith')] WMH19030527-V01-21-page3.txt: [('BAND-', 'BAND'), ('pun-', 'pun'), ('-', ''), ('ali-', 'ali'), ('rep-', 'rep')] WMH19030603-V01-22-page1.txt: [('TES-', 'TES')] WMH19030603-V01-22-page2.txt: [('SAB-', 'SAB'), ('CON-', 'CON')] WMH19030603-V01-22-page3.txt: [('----', '---'), ('AB-', 'AB'), ('-', ''), ('physi-', 'physi')] WMH19030603-V01-22-page4.txt: [('-', '')] WMH19030610-V01-23-page1.txt: [('-', ''), ('-', '')] WMH19030610-V01-23-page3.txt: [('-', ''), ('-', ''), ('reason-', 'reason')] WMH19030624-V01-25-page1.txt: [('DEpART-', 'DEpART'), ('-', '')] WMH19030624-V01-25-page3.txt: [('DETERIORA-', 'DETERIORA')] WMH19030624-V01-25-page4.txt: [('Endeavor.-', 'Endeavor.'), ('Mc-', 'Mc')] WMH19030701-V01-26-page1.txt: [('-', ''), ('any.-', 'any.'), ('DEPART-', 'DEPART')] WMH19030701-V01-26-page3.txt: [('-', ''), ('-', ''), ('distribu-', 'distribu'), ('per-', 'per')] WMH19030701-V01-26-page4.txt: [('-', '')] WMH19030708-V01-27-page1.txt: [('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19030708-V01-27-page2.txt: [('-', ''), ('-', '')] WMH19030708-V01-27-page3.txt: [('pro-', 'pro')] WMH19030708-V01-27-page4.txt: [('Le-', 'Le'), ('-', '')] WMH19030715-V01-28-page1.txt: [('-', '')] WMH19030715-V01-28-page2.txt: [('-chool', 'chool')] WMH19030715-V01-28-page3.txt: [('-IN', 'IN')] WMH19030715-V01-28-page4.txt: [('HER-', 'HER'), ('non-un-', 'non-un')] WMH19030722-V01-29-page1.txt: [('-', '')] WMH19030722-V01-29-page2.txt: [('assem-', 'assem')] WMH19030722-V01-29-page3.txt: [('-', ''), ('-', '')] WMH19030722-V01-29-page4.txt: [('cents-', 'cents'), ('-Elder', 'Elder')] WMH19030930-V01-39-page3.txt: [('in-', 'in'), ('-to', 'to')] WMH19030930-V01-39-page4.txt: [('-', ''), ('-', ''), ('-revived.', 'revived.')] WMH19031028-V01-43-page1.txt: [('-the', 'the'), ('corn-', 'corn'), ('-and', 'and'), ('be-', 'be'), ('-into', 'into')] WMH19031028-V01-43-page4.txt: [('-', ''), ('-judgment', 'judgment'), ('-', '')] WMH19031118-V01-46-page1.txt: [('-', ''), ('-', ''), ('mission--', 'mission-')] WMH19031118-V01-46-page4.txt: [('-the', 'the'), ('-', '')] WMH19040106-V02-02-page1.txt: [('every-', 'every'), ('peo-', 'peo'), ('-', ''), ('cor-', 'cor')] WMH19040106-V02-02-page2.txt: [('indi-', 'indi')] WMH19040106-V02-02-page3.txt: [('-such', 'such')] WMH19040106-V02-02-page4.txt: [('--Prof.', '-Prof.'), ('Wag-', 'Wag'), ('"Work-', '"Work')] WMH19040113-V02-03-page1.txt: [('-', ''), ('Van-', 'Van'), ('camp-meet-', 'camp-meet'), ('now-', 'now'), ('conven-', 'conven')] WMH19040113-V02-03-page2.txt: [('God--', 'God-')] WMH19040113-V02-03-page3.txt: [('AC-', 'AC'), ('-', '')] WMH19040113-V02-03-page4.txt: [('--A', '-A')] WMH19040127-V02-04-page1.txt: [('PRO-', 'PRO'), ('continu-', 'continu')] WMH19040127-V02-04-page2.txt: [('as-', 'as')] WMH19040127-V02-04-page3.txt: [('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19040127-V02-04-page4.txt: [('church.-', 'church.'), ('improve-', 'improve'), ('-A', 'A')] WMH19040203-V02-05-page3.txt: [('GENER-', 'GENER')] WMH19040203-V02-05-page4.txt: [('INSTRUCT-', 'INSTRUCT'), ('INSTRUCT-', 'INSTRUCT'), ('-', ''), ('at-', 'at'), ('--"We', '-"We')] WMH19040210-V02-06-page1.txt: [('-', ''), ('-', ''), ('en-', 'en')] WMH19040210-V02-06-page2.txt: [('educa-', 'educa')] WMH19040210-V02-06-page3.txt: [('-', ''), ('Seventh-', 'Seventh'), ('-', '')] WMH19040210-V02-06-page4.txt: [('--In', '-In'), ('San-', 'San'), ('--The', '-The')] WMH19040217-V02-07-page1.txt: [('--sacred', '-sacred'), ('Assn.-', 'Assn.'), ('-', ''), ('President-', 'President')] WMH19040217-V02-07-page3.txt: [('-', ''), ('-', '')] WMH19040217-V02-07-page4.txt: [('--Elder', '-Elder'), ('-will', 'will')] WMH19040224-V02-08-page1.txt: [('President-', 'President'), ('Assn.-', 'Assn.'), ('con-', 'con'), ('priv-', 'priv')] WMH19040224-V02-08-page2.txt: [('-', ''), ('na-', 'na')] WMH19040224-V02-08-page4.txt: [('-', ''), ('--Elder', '-Elder'), ('--The', '-The'), ('-will', 'will'), ('o--', 'o-')] WMH19040302-V02-09-page1.txt: [('-to', 'to')] WMH19040302-V02-09-page3.txt: [('--"the', '-"the'), ('-Dissipating', 'Dissipating')] WMH19040302-V02-09-page4.txt: [('-', ''), ('-an', 'an'), ('-', '')] WMH19040309-V02-10-page1.txt: [('Bat-', 'Bat'), ('-', '')] WMH19040309-V02-10-page2.txt: [('AD-', 'AD'), ('-subscriptions', 'subscriptions')] WMH19040309-V02-10-page3.txt: [('-This', 'This'), ('-', '')] WMH19040309-V02-10-page4.txt: [('--"We', '-"We')] WMH19040316-V02-11-page1.txt: [('righteous-', 'righteous'), ('Savioin-', 'Savioin'), ('Christ.--', 'Christ.-')] WMH19040316-V02-11-page2.txt: [('Berrien-', 'Berrien'), ('be-', 'be')] WMH19040316-V02-11-page3.txt: [('--.', '-.')] WMH19040316-V02-11-page4.txt: [('-', ''), ('-At', 'At'), ('-to', 'to')] WMH19040323-V02-12-page1.txt: [('-F.', 'F.'), ('-the', 'the'), ('-', '')] WMH19040323-V02-12-page2.txt: [('and-', 'and')] WMH19040323-V02-12-page3.txt: [('-taken', 'taken'), ('-', '')] WMH19040323-V02-12-page4.txt: [('Mc-', 'Mc')] WMH19040330-V02-13-page1.txt: [('--', '-'), ('RE-', 'RE'), ('PER-', 'PER')] WMH19040330-V02-13-page2.txt: [('-have', 'have'), ('REC-', 'REC')] WMH19040330-V02-13-page3.txt: [('bili-', 'bili'), ('biliousness.-', 'biliousness.')] WMH19040330-V02-13-page4.txt: [('-be', 'be')] WMH19040406-V02-14-page3.txt: [('In-', 'In'), ('abdom-', 'abdom'), ('-', '')] WMH19040406-V02-14-page4.txt: [('-', '')] WMH19040413-V02-15-page3.txt: [('an-', 'an'), ('-', '')] WMH19040413-V02-15-page4.txt: [('SOUTH-', 'SOUTH'), ('-illustrated.', 'illustrated.'), ('-disposed', 'disposed')] WMH19040420-V02-16-page1.txt: [('Zi-', 'Zi'), ('for-', 'for')] WMH19040420-V02-16-page3.txt: [('-the', 'the'), ('-', ''), ('-DR.', 'DR.'), ('Three-', 'Three')] WMH19040420-V02-16-page4.txt: [('-', ''), ('-', '')] WMH19040427-V02-17-page2.txt: [('The-', 'The'), ('-', '')] WMH19040427-V02-17-page3.txt: [('-', '')] WMH19040427-V02-17-page4.txt: [('Swed-', 'Swed'), ('-', '')] WMH19040504-V02-18-page2.txt: [('-', '')] WMH19040504-V02-18-page3.txt: [('-its', 'its'), ('-of', 'of')] WMH19040504-V02-18-page4.txt: [('-', '')] WMH19040511-V02-19-page2.txt: [('-', ''), ('---of', '--of')] WMH19040511-V02-19-page3.txt: [('-', ''), ('-', ''), ('mail-', 'mail'), ('-', ''), ('-', ''), ('-truths', 'truths'), ('-', ''), ('-Allegan', 'Allegan')] WMH19040511-V02-19-page4.txt: [('-Remember', 'Remember'), ('-', ''), ('be-', 'be')] WMH19040518-V02-20-page2.txt: [('-be', 'be')] WMH19040518-V02-20-page3.txt: [('-note', 'note'), ('-mentioned', 'mentioned')] WMH19040518-V02-20-page4.txt: [('inter.-', 'inter.'), ('-', ''), ('-blessed', 'blessed'), ('-', '')] WMH19040601-V02-22-page1.txt: [('Ohio--', 'Ohio-'), ('-', ''), ('-Irwin', 'Irwin'), ('Andrea-', 'Andrea')] WMH19040601-V02-22-page2.txt: [('-the', 'the')] WMH19040608-V02-23-page1.txt: [('-A.', 'A.'), ('Treasurer-D.-', 'Treasurer-D.'), ('-', ''), ('-', ''), ("-urged'", "urged'")] WMH19040608-V02-23-page2.txt: [('-', ''), ('dis-', 'dis'), ('-', ''), ('-', ''), ('-', ''), ('in-', 'in')] WMH19040608-V02-23-page3.txt: [('-', ''), ('-', ''), ('-', ''), ('-God', 'God'), ('-to', 'to'), ('en-', 'en')] WMH19040608-V02-23-page4.txt: [('De-', 'De'), ('-', '')] WMH19040622-V02-24-page3.txt: [('-', ''), ('for-', 'for'), ('-', '')] WMH19040622-V02-24-page4.txt: [('-', ''), ('Some-', 'Some'), ('-', '')] WMH19040629-V02-25-page1.txt: [('-cultivate', 'cultivate'), ('-', ''), ('-', ''), ('or-', 'or')] WMH19040629-V02-25-page2.txt: [('-this', 'this'), ('-be', 'be')] WMH19040629-V02-25-page3.txt: [('-', ''), ('-five.', 'five.')] WMH19040629-V02-25-page4.txt: [('-', '')] WMH19040706-V02-26-page2.txt: [('con-', 'con'), ('attend-', 'attend'), ('Mission--', 'Mission-')] WMH19040706-V02-26-page3.txt: [('--', '-'), ('-', ''), ('-', ''), ('-and', 'and'), ('world-', 'world')] WMH19040713-V02-27-page1.txt: [('-', ''), ('CIRCUM-', 'CIRCUM'), ('ELECT-', 'ELECT'), ('DEFI-', 'DEFI')] WMH19040713-V02-27-page2.txt: [('-ALL', 'ALL')] WMH19040713-V02-27-page3.txt: [('HERALD.-', 'HERALD.'), ('-mee', 'mee'), ('-people', 'people'), ('-', ''), ('-whom', 'whom'), ('-', ''), ('par-', 'par')] WMH19040720-V02-28-page1.txt: [('Vox-', 'Vox'), ('-a', 'a'), ('-principles', 'principles')] WMH19040720-V02-28-page3.txt: [('-in', 'in')] WMH19040720-V02-28-page4.txt: [('-', ''), ('Na-', 'Na'), ('-The', 'The'), ('-', ''), ('-', ''), ('announ-', 'announ'), ('---"Our', '--"Our')] WMH19040727-V02-29-page1.txt: [('-', ''), ('straw-', 'straw'), ('-', ''), ('lights-', 'lights'), ('-Righteousness', 'Righteousness')] WMH19040727-V02-29-page2.txt: [('--That', '-That'), ('-A', 'A')] WMH19040727-V02-29-page3.txt: [('-may', 'may'), ('-small', 'small'), ('-', ''), ('--helpful', '-helpful')] WMH19040803-V02-30-page1.txt: [('-the', 'the'), ('-', ''), ('round-', 'round'), ('-On', 'On'), ('-ether', 'ether'), ('purchas-', 'purchas'), ('.-', '.'), ('-not', 'not')] WMH19040803-V02-30-page2.txt: [('-', ''), ('-', ''), ('-of', 'of')] WMH19040803-V02-30-page3.txt: [('ex-', 'ex'), ('-church', 'church'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19040803-V02-30-page4.txt: [('-', ''), ('Van-', 'Van'), ('Van-', 'Van')] WMH19040810-V02-31-page1.txt: [('-finding', 'finding'), ('-great', 'great'), ('South-', 'South'), ('-', ''), ('giv-', 'giv'), ('-', ''), ('peo-', 'peo')] WMH19040810-V02-31-page3.txt: [('-', '')] WMH19040810-V02-31-page4.txt: [('la-', 'la'), ('Sabbath-', 'Sabbath'), ('-', '')] WMH19040817-V02-32-page1.txt: [('-', ''), ('-', ''), ('con-', 'con')] WMH19040817-V02-32-page2.txt: [('-to', 'to'), ('-Dr.', 'Dr.')] WMH19040817-V02-32-page3.txt: [('-', ''), ('-', ''), ('-', '')] WMH19040817-V02-32-page4.txt: [('benefit.-', 'benefit.')] WMH19040831-V02-33-page1.txt: [('THEM-', 'THEM')] WMH19040831-V02-33-page3.txt: [('repair-', 'repair')] WMH19040831-V02-33-page4.txt: [('Howe-', 'Howe'), ('-', '')] WMH19040914-V02-34-page1.txt: [('--', '-'), ('-', ''), ('-', '')] WMH19040914-V02-34-page2.txt: [('--', '-')] WMH19040914-V02-34-page3.txt: [('-and', 'and'), ('-', '')] WMH19040921-V02-34a-page3.txt: [('con-', 'con'), ('-', ''), ('-', '')] WMH19040921-V02-34a-page4.txt: [('II-', 'II')] WMH19040928-V02-35-page1.txt: [('CAMP-', 'CAMP'), ('of-', 'of')] WMH19040928-V02-35-page2.txt: [('-', '')] WMH19040928-V02-35-page3.txt: [('pray-', 'pray')] WMH19040928-V02-35-page4.txt: [('-', ''), ('-Miss', 'Miss'), ('-', ''), ('--', '-')] WMH19041005-V02-36-page1.txt: [('House-to-', 'House-to'), ('librari-', 'librari'), ('"-', '"')] WMH19041005-V02-36-page2.txt: [('-I', 'I'), ('-', '')] WMH19041005-V02-36-page3.txt: [('indications-', 'indications'), ('-', ''), ('THANK-', 'THANK'), ('-have', 'have'), ('-breads', 'breads'), ('-local', 'local')] WMH19041005-V02-36-page4.txt: [('-', '')] WMH19041012-V02-37-page2.txt: [('Ad-', 'Ad'), ('-', ''), ('-', ''), ('at-', 'at')] WMH19041012-V02-37-page4.txt: [('-conference', 'conference')] WMH19041019-V02-38-page1.txt: [('done--', 'done-')] WMH19041019-V02-38-page3.txt: [('ap-', 'ap'), ('perform-', 'perform'), ('-', ''), ('-tends', 'tends')] WMH19041019-V02-38-page4.txt: [('Sabbath--', 'Sabbath-'), ('-Nashville', 'Nashville')] WMH19041026-V02-39-page1.txt: [('disci-', 'disci')] WMH19041026-V02-39-page2.txt: [('EN-', 'EN'), ('PEO-', 'PEO'), ('DISAP-', 'DISAP')] WMH19041026-V02-39-page3.txt: [('PRE-', 'PRE'), ('Me-', 'Me'), ('at-', 'at')] WMH19041026-V02-39-page4.txt: [('in-', 'in'), ('-', ''), ('Haughey-', 'Haughey')] WMH19041102-V02-40-page1.txt: [('Lga-', 'Lga'), ('-West', 'West'), ('En-', 'En'), ('-themselves', 'themselves'), ('-', '')] WMH19041102-V02-40-page2.txt: [('--about', '-about'), ('-this', 'this'), ('-hoped', 'hoped')] WMH19041102-V02-40-page3.txt: [('-', ''), ('-asked', 'asked'), ('connec-', 'connec')] WMH19041102-V02-40-page4.txt: [('Sabbath-', 'Sabbath'), ('-some', 'some'), ('-', '')] WMH19041109-V02-41-page1.txt: [('San-', 'San')] WMH19041109-V02-41-page2.txt: [('non-', 'non'), ('ever-', 'ever')] WMH19041109-V02-41-page4.txt: [('in-', 'in')] WMH19041116-V02-42-page1.txt: [('-', '')] WMH19041116-V02-42-page4.txt: [('-', ''), ('Mc-', 'Mc'), ('Health-', 'Health')] WMH19041123-V02-43-page1.txt: [('connected--', 'connected-'), ('-', ''), ('-is', 'is'), ('-and', 'and'), ('-', '')] WMH19041123-V02-43-page2.txt: [('hun-', 'hun')] WMH19041123-V02-43-page3.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('Sabbath-', 'Sabbath'), ('-could', 'could')] WMH19041123-V02-43-page4.txt: [('-', ''), ('-', ''), ('-', '')] WMH19041130-V02-44-page1.txt: [('Mc-', 'Mc'), ('-West', 'West')] WMH19041130-V02-44-page3.txt: [('-', ''), ('discour-', 'discour'), ('NEAT-', 'NEAT'), ('-', ''), ('-', '')] WMH19041130-V02-44-page4.txt: [('io-', 'io'), ('-', '')] WMH19041207-V02-45-page1.txt: [('-', ''), ('-', ''), ('-', ''), ('--', '-'), ('-', ''), ('Ave-', 'Ave')] WMH19041207-V02-45-page2.txt: [('-and', 'and'), ('--No.', '-No.'), ('CHRIST-', 'CHRIST'), ('To-', 'To'), ('Paw-', 'Paw'), ('confer-', 'confer')] WMH19041207-V02-45-page3.txt: [('-', ''), ('-', '')] WMH19041207-V02-45-page4.txt: [('-aged', 'aged'), ('resurrection.-', 'resurrection.')] WMH19041214-V02-46-page1.txt: [('o-', 'o'), ('-', '')] WMH19041214-V02-46-page2.txt: [('-courage', 'courage')] WMH19041214-V02-46-page3.txt: [('-through', 'through'), ('at-', 'at'), ('-', '')] WMH19041214-V02-46-page4.txt: [('-to', 'to'), ('Sab-', 'Sab'), ('an-', 'an'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19041221-V02-47-page1.txt: [('Education-', 'Education'), ('-to', 'to'), ('and-', 'and'), ('-', ''), ('-and', 'and'), ('-grateful', 'grateful')] WMH19041221-V02-47-page2.txt: [('-', ''), ('teach-', 'teach')] WMH19041221-V02-47-page3.txt: [('-church', 'church'), ('WATCH-', 'WATCH'), ('-work', 'work'), ('NEC-', 'NEC'), ('DE-', 'DE'), ('EDI-', 'EDI'), ('REG-', 'REG'), ('-', ''), ('De-', 'De')] WMH19041221-V02-47-page4.txt: [('-', ''), ('-', ''), ('be-', 'be'), ('-', ''), ('WATCH-', 'WATCH'), ('-of', 'of')] WMH19041228-V02-48-page1.txt: [('con-', 'con'), ('-', ''), ('-', ''), ('-during', 'during')] WMH19041228-V02-48-page2.txt: [('-', '')] WMH19041228-V02-48-page3.txt: [('-or', 'or')] WMH19041228-V02-48-page4.txt: [('Sabbath-', 'Sabbath'), ('-', ''), ('-', ''), ('Sabbath-', 'Sabbath')] WMH19050104-V03-01-page1.txt: [('-', ''), ('Con-', 'Con'), ('-', '')] WMH19050104-V03-01-page2.txt: [('Orange-', 'Orange'), ('-', ''), ('-previous', 'previous')] WMH19050104-V03-01-page3.txt: [('--but', '-but'), ('-not', 'not')] WMH19050104-V03-01-page4.txt: [('-', '')] WMH19050111-V03-02-page1.txt: [('o-', 'o')] WMH19050111-V03-02-page2.txt: [('bap-', 'bap'), ('-', ''), ('-', ''), ('-', '')] WMH19050111-V03-02-page3.txt: [('-', ''), ('-', ''), ('-difficult', 'difficult'), ('-and', 'and'), ('-new', 'new')] WMH19050111-V03-02-page4.txt: [('-have', 'have'), ('faith-', 'faith'), ('-ful', 'ful'), ('zo-', 'zo'), ('Sabbath-', 'Sabbath')] WMH19050118-V03-03-page1.txt: [('V-', 'V')] WMH19050118-V03-03-page2.txt: [('-', '')] WMH19050118-V03-03-page4.txt: [('-G.', 'G.'), ('-', '')] WMH19050201-V03-04-page1.txt: [('-', ''), ('-', ''), ('-', '')] WMH19050201-V03-04-page2.txt: [('-', ''), ('-', '')] WMH19050201-V03-04-page4.txt: [("-YOUTH'S", "YOUTH'S"), ('-page', 'page'), ('Mich-', 'Mich'), ('-', ''), ('-', ''), ('-', '')] WMH19050208-V03-05-page1.txt: [('-', ''), ('Cre-', 'Cre')] WMH19050208-V03-05-page2.txt: [('-over', 'over'), ('corn-', 'corn')] WMH19050208-V03-05-page3.txt: [('-to', 'to')] WMH19050208-V03-05-page4.txt: [('-some', 'some')] WMH19050215-V03-06-page2.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19050215-V03-06-page3.txt: [('-', '')] WMH19050215-V03-06-page4.txt: [('-for', 'for'), ('-', '')] WMH19050222-V03-07-page1.txt: [('-', ''), ('.-', '.'), ('reports-', 'reports'), ('-', ''), ('-', '')] WMH19050222-V03-07-page2.txt: [('-', ''), ('-', '')] WMH19050222-V03-07-page3.txt: [('-', ''), ('-', ''), ('-We', 'We'), ('CAN-', 'CAN')] WMH19050222-V03-07-page4.txt: [('con-', 'con')] WMH19050301-V03-08-page1.txt: [('W.-', 'W.'), ('Mc-', 'Mc')] WMH19050301-V03-08-page4.txt: [('-', '')] WMH19050315-V03-10-page1.txt: [('-', ''), ('-', ''), ('-sending', 'sending'), ('De-', 'De')] WMH19050315-V03-10-page3.txt: [('-proclaim', 'proclaim'), ('-and', 'and')] WMH19050315-V03-10-page4.txt: [('agnos-', 'agnos'), ('ordi-', 'ordi')] WMH19050322-V03-11-page1.txt: [('-HERALD.', 'HERALD.')] WMH19050322-V03-11-page2.txt: [('-the', 'the'), ('-it', 'it'), ('-incident', 'incident'), ('at-', 'at')] WMH19050322-V03-11-page3.txt: [('-oldest', 'oldest'), ('-became', 'became'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('hall-', 'hall'), ('neces-', 'neces')] WMH19050322-V03-11-page5.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('C-', 'C'), ('G-', 'G'), ('-', '')] WMH19050322-V03-11-page6.txt: [('-', ''), ('-', '')] WMH19050329-V03-12-page1.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('Confer-', 'Confer'), ('-', ''), ('Depart-', 'Depart')] WMH19050329-V03-12-page2.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19050329-V03-12-page4.txt: [('-', ''), ('-"The', '"The')] WMH19050405-V03-13-page1.txt: [('I--', 'I-'), ('-', ''), ('CON-', 'CON')] WMH19050405-V03-13-page4.txt: [('-', '')] WMH19050413-V03-14-page1.txt: [('-', '')] WMH19050413-V03-14-page2.txt: [('-', '')] WMH19050413-V03-14-page3.txt: [('GIV-', 'GIV'), ('-disciplined', 'disciplined')] WMH19050419-V03-15-page1.txt: [('-', ''), ('-', ''), ('confer-', 'confer')] WMH19050419-V03-15-page3.txt: [('So-', 'So')] WMH19050419-V03-15-page4.txt: [('-Dr.', 'Dr.')] WMH19050426-V03-16-page1.txt: [('-', ''), ('Roth-', 'Roth')] WMH19050426-V03-16-page2.txt: [('-', ''), ('-', ''), ('of-', 'of')] WMH19050426-V03-16-page3.txt: [('-knees', 'knees')] WMH19050426-V03-16-page4.txt: [('cur-', 'cur'), ('-', ''), ('type-', 'type')] WMH19050503-V03-17-page1.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('UTTER-', 'UTTER'), ('Ob-', 'Ob')] WMH19050503-V03-17-page2.txt: [('activity-', 'activity'), ('in-', 'in')] WMH19050503-V03-17-page3.txt: [('-THE', 'THE'), ('-well-officered', 'well-officered'), ('-and', 'and'), ('Pennsyl-', 'Pennsyl')] WMH19050503-V03-17-page4.txt: [('-', '')] WMH19050510-V03-18-page1.txt: [('di-', 'di'), ('-of', 'of'), ('teach-', 'teach')] WMH19050510-V03-18-page2.txt: [('-', '')] WMH19050510-V03-18-page4.txt: [('Seventh-', 'Seventh')] WMH19050517-V03-19-page1.txt: [('lead-', 'lead'), ('-', '')] WMH19050517-V03-19-page3.txt: [('-of', 'of')] WMH19050517-V03-19-page4.txt: [('Confer-', 'Confer'), ('-ence', 'ence')] WMH19050524-V03-20-page2.txt: [('call-', 'call')] WMH19050524-V03-20-page3.txt: [('-possession.', 'possession.')] WMH19050524-V03-20-page4.txt: [('-', '')] WMH19050531-V03-21-page1.txt: [('-', '')] WMH19050531-V03-21-page3.txt: [('MICH-', 'MICH'), ('-"Missionary', '"Missionary'), ('-be', 'be')] WMH19050531-V03-21-page4.txt: [('-', '')] WMH19050607-V03-22-page1.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('--live', '-live')] WMH19050607-V03-22-page3.txt: [('-and', 'and'), ('-Encyclopedia', 'Encyclopedia'), ('-Luke', 'Luke'), ('-Isa.', 'Isa.'), ('-', ''), ('-', ''), ('-Matt.', 'Matt.'), ('-Jews', 'Jews'), ('--Heb.', '-Heb.'), ('-Rev.', 'Rev.'), ('-Isa.', 'Isa.'), ('-Neh.', 'Neh.'), ('-Ex.', 'Ex.'), ('-', ''), ('---Gen.', '--Gen.'), ('-', ''), ('--Gen.', '-Gen.'), ('-', ''), ('-', ''), ('-', ''), ('-Isa.', 'Isa.'), ('-', '')] WMH19050607-V03-22-page4.txt: [('-', '')] WMH19050614-V03-23-page2.txt: [('---"I', '--"I'), ('-', '')] WMH19050614-V03-23-page3.txt: [('les-', 'les')] WMH19050614-V03-23-page4.txt: [('-', ''), ('quar-', 'quar')] WMH19050621-V03-24-page1.txt: [('-', ''), ('edu-', 'edu')] WMH19050621-V03-24-page2.txt: [('-may', 'may'), ('-', ''), ('-', '')] WMH19050621-V03-24-page3.txt: [('-', '')] WMH19050621-V03-24-page4.txt: [('-', ''), ('-', '')] WMH19050628-V03-25-page1.txt: [('-', ''), ('con-', 'con'), ('-', '')] WMH19050705-V03-26-page1.txt: [('-s', 's'), ('a-', 'a'), ('--Selected.', '-Selected.'), ('Un-', 'Un')] WMH19050705-V03-26-page2.txt: [('-by', 'by'), ('-', '')] WMH19050705-V03-26-page3.txt: [('-', ''), ('-', ''), ('-', ''), ('-in', 'in'), ('gain-', 'gain'), ('-', '')] WMH19050712-V03-27-page1.txt: [('re-', 're')] WMH19050712-V03-27-page2.txt: [('Self-', 'Self'), ('defi-', 'defi')] WMH19050712-V03-27-page3.txt: [('month-', 'month'), ('superintend-', 'superintend'), ('--Selected.', '-Selected.'), ('-', ''), ('-', ''), ('Sabbath-', 'Sabbath'), ('ef-', 'ef')] WMH19050712-V03-27-page4.txt: [('-pain', 'pain')] WMH19050719-V03-28-page1.txt: [('-', ''), ('arrang-', 'arrang')] WMH19050719-V03-28-page3.txt: [('-We', 'We')] WMH19050719-V03-28-page4.txt: [('-', '')] WMH19050726-V03-29-page1.txt: [('-with', 'with'), ('CAMP-', 'CAMP'), ('-', ''), ('-', ''), ('-', '')] WMH19050726-V03-29-page3.txt: [('being--', 'being-'), ('-to', 'to'), ('-', ''), ('-', '')] WMH19050726-V03-29-page4.txt: [('-', ''), ('-', '')] WMH19050802-V03-30-page1.txt: [('Camp-', 'Camp'), ('---health', '--health'), ('--for', '-for'), ('Camp-', 'Camp'), ('-', '')] WMH19050802-V03-30-page2.txt: [('asked-', 'asked')] WMH19050802-V03-30-page3.txt: [('Sabbath-', 'Sabbath'), ('--Selected.', '-Selected.')] WMH19050802-V03-30-page4.txt: [('corn-', 'corn'), ('-', ''), ('Organiza-', 'Organiza')] WMH19050809-V03-31-page1.txt: [('-', ''), ('--', '-'), ('Camp-', 'Camp'), ('-', '')] WMH19050809-V03-31-page2.txt: [('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19050809-V03-31-page3.txt: [('-', ''), ('neces-', 'neces'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19050816-V03-32-page1.txt: [('inter-', 'inter'), ('-', '')] WMH19050816-V03-32-page2.txt: [('fol-', 'fol'), ('them-', 'them')] WMH19050816-V03-32-page3.txt: [('-Foster', 'Foster')] WMH19050816-V03-32-page4.txt: [('-EZRA', 'EZRA')] WMH19050830-V03-33-page2.txt: [('-', '')] WMH19050830-V03-33-page3.txt: [('-', ''), ('-cents', 'cents'), ('-', ''), ('-', '')] WMH19050830-V03-33-page4.txt: [('-to', 'to'), ('-book', 'book'), ('HER-', 'HER')] WMH19050906-V03-34-page1.txt: [('Sabbath-', 'Sabbath')] WMH19050906-V03-34-page2.txt: [('DEPART-', 'DEPART')] WMH19050906-V03-34-page3.txt: [('--Selected.', '-Selected.'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('Camp-meet-', 'Camp-meet')] WMH19050906-V03-34-page4.txt: [('-', ''), ('-at', 'at'), ('-', ''), ('work.-', 'work.'), ('-', ''), ('-', '')] WMH19050913-V03-35-page1.txt: [('what-', 'what')] WMH19050913-V03-35-page3.txt: [('--', '-'), ('under-', 'under')] WMH19050913-V03-35-page4.txt: [('-', ''), ('-', '')] WMH19050920-V03-36-page1.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19050920-V03-36-page2.txt: [('Sabbath-', 'Sabbath')] WMH19050920-V03-36-page3.txt: [('-purpose', 'purpose')] WMH19050920-V03-36-page4.txt: [('-', ''), ('-son', 'son'), ('REVIEW--', 'REVIEW-'), ('SIGNS-', 'SIGNS'), ('HEALTH-', 'HEALTH')] WMH19050927-V03-37-page1.txt: [('-', ''), ('-I', 'I')] WMH19050927-V03-37-page2.txt: [('-', ''), ('-low', 'low'), ('im-', 'im')] WMH19050927-V03-37-page3.txt: [('cher-', 'cher')] WMH19051004-V03-38-page1.txt: [('ad-', 'ad'), ('-', ''), ('-', ''), ('-', ''), ('Ad-', 'Ad')] WMH19051004-V03-38-page2.txt: [('-', '')] WMH19051004-V03-38-page3.txt: [('--Selected.', '-Selected.'), ('coun-', 'coun'), ('Yose-', 'Yose')] WMH19051004-V03-38-page4.txt: [('and-', 'and'), ('thepro-', 'thepro')] WMH19051011-V03-39-page1.txt: [('-', '')] WMH19051011-V03-39-page2.txt: [('Sab-', 'Sab')] WMH19051018-V03-40-page1.txt: [('corn-', 'corn')] WMH19051018-V03-40-page2.txt: [('commandments-', 'commandments'), ('-much', 'much')] WMH19051018-V03-40-page3.txt: [('-', ''), ('-', '')] WMH19051018-V03-40-page4.txt: [('--Tarry', '-Tarry'), ('Bourdeau-', 'Bourdeau')] WMH19051025-V03-41-page1.txt: [('-', '')] WMH19051025-V03-41-page3.txt: [('--Selected.', '-Selected.'), ('pro-', 'pro')] WMH19051025-V03-41-page4.txt: [('--also', '-also')] WMH19051101-V03-42-page1.txt: [('future--', 'future-'), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19051101-V03-42-page2.txt: [('Ad-', 'Ad')] WMH19051101-V03-42-page3.txt: [('essential-', 'essential'), ('suf-', 'suf')] WMH19051101-V03-42-page4.txt: [('-will', 'will'), ('MESSEN-', 'MESSEN'), ('-and', 'and'), ('-', '')] WMH19051108-V03-43-page1.txt: [('domi-', 'domi'), ('-', ''), ('-', '')] WMH19051108-V03-43-page3.txt: [('Mc-', 'Mc'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-Maple', 'Maple')] WMH19051108-V03-43-page4.txt: [('-of', 'of'), ('-', ''), ('-Editor', 'Editor')] WMH19051122-V03-45-page1.txt: [('-', ''), ('-', ''), ('-"I\'ll', '"I\'ll'), ('-', ''), ('-', ''), ('A-', 'A'), ('-lambs', 'lambs'), ('Advent-', 'Advent')] WMH19051122-V03-45-page2.txt: [('-', '')] WMH19051122-V03-45-page4.txt: [('-', ''), ('-', ''), ('Broth-', 'Broth')] WMH19051129-V03-46-page1.txt: [('-weary', 'weary'), ('sys-', 'sys')] WMH19051129-V03-46-page2.txt: [('-', ''), ('in-', 'in')] WMH19051129-V03-46-page3.txt: [('-this', 'this'), ('-', ''), ('-', '')] WMH19051129-V03-46-page4.txt: [('LIT-', 'LIT'), ('LIT-', 'LIT')] WMH19051206-V03-47-page1.txt: [('-our', 'our'), ('Cedar-', 'Cedar'), ('De-', 'De'), ('-', ''), ('--', '-'), ('interest-', 'interest')] WMH19051206-V03-47-page2.txt: [('-', '')] WMH19051206-V03-47-page3.txt: [('-fruit', 'fruit')] WMH19051206-V03-47-page4.txt: [('ad-', 'ad'), ('-', '')] WMH19051213-V03-48-page1.txt: [('recitation.-', 'recitation.')] WMH19051213-V03-48-page2.txt: [('can-', 'can'), ('POT-', 'POT')] WMH19051213-V03-48-page3.txt: [('-', ''), ('-', '')] WMH19051213-V03-48-page4.txt: [('copies.-', 'copies.'), ('-Creek', 'Creek'), ('var-', 'var')] WMH19051220-V03-49-page1.txt: [('right-', 'right'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19051220-V03-49-page2.txt: [('ADVENTIST-', 'ADVENTIST'), ('CO-', 'CO'), ('-', ''), ('Ix-', 'Ix')] WMH19051220-V03-49-page3.txt: [('-', ''), ('-', '')] WMH19051227-V03-50-page2.txt: [('-', ''), ("People's-", "People's"), ('-given', 'given'), ('-etc.', 'etc.'), ('-', '')] WMH19051227-V03-50-page3.txt: [('-', ''), ('-Although', 'Although'), ('o-', 'o'), ('-', ''), ('Her-', 'Her')] WMH19060103-V04-01-page1.txt: [('consider-', 'consider'), ('ask-', 'ask'), ('-.', '.')] WMH19060103-V04-01-page2.txt: [('years..-', 'years..'), ('sin-', 'sin')] WMH19060103-V04-01-page3.txt: [('Self-', 'Self'), ('birth-', 'birth')] WMH19060103-V04-01-page4.txt: [('-', ''), ('-', '')] WMH19060110-V04-02-page1.txt: [('-fifteen', 'fifteen'), ('-', '')] WMH19060110-V04-02-page2.txt: [('-', ''), ('-Paw', 'Paw'), ('-', ''), ('-', ''), ('-', '')] WMH19060110-V04-02-page4.txt: [('-', '')] WMH19060117-V04-03-page1.txt: [('Secretary--', 'Secretary-'), ('-', '')] WMH19060117-V04-03-page2.txt: [('CONFER-', 'CONFER')] WMH19060117-V04-03-page3.txt: [('-loans', 'loans'), ('-', ''), ('R-', 'R'), ('Mich-', 'Mich'), ('-', '')] WMH19060117-V04-03-page4.txt: [('-', ''), ('Offer-', 'Offer'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19060124-V04-04-page1.txt: [('--jr', '-jr'), ('--', '-'), ('-', ''), ('-', ''), ('SABBATH-', 'SABBATH'), ('-', '')] WMH19060124-V04-04-page2.txt: [('INCORPO-', 'INCORPO'), ('aggre-', 'aggre')] WMH19060124-V04-04-page3.txt: [('Mc-', 'Mc'), ('-', ''), ('-', ''), ('-', '')] WMH19060124-V04-04-page4.txt: [('-', ''), ('-', '')] WMH19060131-V04-05-page1.txt: [('------', '-----'), ('-', ''), ('-luessiorpi', 'luessiorpi')] WMH19060131-V04-05-page2.txt: [('-', ''), ('-', ''), ('-', ''), ('Mis-', 'Mis')] WMH19060131-V04-05-page3.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('sub-', 'sub'), ('-', '')] WMH19060207-V04-06-page1.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19060207-V04-06-page3.txt: [('Spirit-', 'Spirit'), ('-', ''), ('place.-', 'place.')] WMH19060214-V04-07-page1.txt: [('t-', 't'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19060214-V04-07-page2.txt: [('-', ''), ('-Lessons', 'Lessons')] WMH19060214-V04-07-page3.txt: [('-all', 'all'), ('-the', 'the'), ('Danish-', 'Danish')] WMH19060214-V04-07-page4.txt: [('BOOK.-', 'BOOK.'), ('fitthem-', 'fitthem')] WMH19060221-V04-08-page1.txt: [('--', '-'), ("''-is'-", "''-is'"), ('develop-', 'develop'), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19060221-V04-08-page2.txt: [('-', ''), ('adopt-', 'adopt')] WMH19060221-V04-08-page3.txt: [('themselves-', 'themselves')] WMH19060221-V04-08-page4.txt: [('-were', 'were')] WMH19060228-V04-09-page1.txt: [('in-', 'in'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19060228-V04-09-page2.txt: [('-', '')] WMH19060228-V04-09-page3.txt: [('-', ''), ('me-', 'me')] WMH19060307-V04-10-page1.txt: [('-', ''), ('Vice-', 'Vice'), ('-', ''), ('-', '')] WMH19060307-V04-10-page2.txt: [('--the', '-the'), ('-', '')] WMH19060307-V04-10-page3.txt: [('-', ''), ('-theory', 'theory')] WMH19060307-V04-10-page4.txt: [('in-', 'in'), ('-', ''), ('-', ''), ('-', '')] WMH19060314-V04-11-page1.txt: [('-', ''), ('con-', 'con'), ('-', ''), ('-', '')] WMH19060314-V04-11-page2.txt: [('-', '')] WMH19060314-V04-11-page3.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('sug-', 'sug')] WMH19060314-V04-11-page4.txt: [('way.-', 'way.'), ('MICHI-', 'MICHI'), ('-', '')] WMH19060321-V04-12-page1.txt: [('con-', 'con'), ('-', '')] WMH19060321-V04-12-page2.txt: [('HER-', 'HER')] WMH19060321-V04-12-page3.txt: [('spelling--', 'spelling-'), ('-future', 'future'), ('San-', 'San')] WMH19060321-V04-12-page4.txt: [('-Will', 'Will')] WMH19060328-V04-13-page1.txt: [('--', '-'), ('GATHERETI-', 'GATHERETI'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('sup-', 'sup')] WMH19060328-V04-13-page2.txt: [('-', ''), ('-', ''), ('-', '')] WMH19060328-V04-13-page3.txt: [('-', ''), ('under-', 'under'), ('receiver-', 'receiver')] WMH19060328-V04-13-page4.txt: [('--', '-'), ('-', ''), ('-', ''), ('-', '')] WMH19060404-V04-14-page1.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19060404-V04-14-page3.txt: [('-Uttered', 'Uttered'), ('--', '-')] WMH19060404-V04-14-page4.txt: [('-', '')] WMH19060411-V04-15-page1.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('Conven-', 'Conven')] WMH19060411-V04-15-page2.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('Offer-', 'Offer'), ('-', '')] WMH19060411-V04-15-page3.txt: [('-', ''), ('-ro', 'ro')] WMH19060411-V04-15-page4.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19060418-V04-16-page1.txt: [('GATHERETI-', 'GATHERETI'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('---the', '--the'), ('-', ''), ('-', ''), ('work-', 'work')] WMH19060418-V04-16-page3.txt: [('-', ''), ('-it', 'it')] WMH19060418-V04-16-page4.txt: [('Hunts-', 'Hunts'), ('-', '')] WMH19060425-V04-17-page1.txt: [('-in', 'in'), ('build-', 'build'), ('-', '')] WMH19060425-V04-17-page2.txt: [('--FLORENCE', '-FLORENCE'), ('right-', 'right')] WMH19060425-V04-17-page3.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('SABBATH-', 'SABBATH')] WMH19060425-V04-17-page4.txt: [('AD-', 'AD'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19060502-V04-18-page1.txt: [('-', ''), ('suf-', 'suf'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('--', '-')] WMH19060502-V04-18-page3.txt: [('excel-', 'excel'), ('-lent', 'lent'), ('ARBEI-', 'ARBEI'), ('arrange-', 'arrange'), ('in-', 'in')] WMH19060502-V04-18-page4.txt: [('-to', 'to'), ('-church', 'church')] WMH19060509-V04-19-page1.txt: [('-', ''), ('GATHERED-', 'GATHERED'), ('"---', '"--')] WMH19060509-V04-19-page2.txt: [('-I', 'I'), ('-', ''), ('-', ''), ('-.God.', '.God.'), ('-', ''), ('-.', '.'), ('-', ''), ('-', ''), ('PROPH-', 'PROPH'), ('PROPH-', 'PROPH'), ('PROPH-', 'PROPH'), ('Lakeview-', 'Lakeview')] WMH19060509-V04-19-page3.txt: [('-', ''), ('-', ''), ('Rogers-', 'Rogers'), ('-', ''), ('-ho', 'ho'), ('near-', 'near'), ('be-', 'be'), ('mat-', 'mat'), ('-', '')] WMH19060523-V04-20-page2.txt: [('-', ''), ('-man', 'man')] WMH19060523-V04-20-page3.txt: [('-', ''), ('-', '')] WMH19060530-V04-21-page1.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-school', 'school'), ('-', ''), ('-', ''), ('-', '')] WMH19060530-V04-21-page2.txt: [('-', ''), ('-', ''), ('denomi-', 'denomi')] WMH19060530-V04-21-page3.txt: [('im-', 'im')] WMH19060530-V04-21-page4.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19060606-V04-22-page1.txt: [('..-', '..')] WMH19060606-V04-22-page2.txt: [('-', '')] WMH19060606-V04-22-page3.txt: [('-Sec.', 'Sec.'), ('.-', '.'), ('SUP-', 'SUP'), ('publishers.compliment-', 'publishers.compliment')] WMH19060606-V04-22-page4.txt: [('-Literary', 'Literary'), ('Michi-', 'Michi'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19060613-V04-23-page1.txt: [('-Sabbath-school', 'Sabbath-school'), ('-Kalamazoo', 'Kalamazoo')] WMH19060613-V04-23-page3.txt: [('-', ''), ('-', ''), ('mail-', 'mail'), ('-', '')] WMH19060613-V04-23-page4.txt: [('ES-', 'ES'), ('Missis.-', 'Missis.'), ('-', '')] WMH19060620-V04-24-page1.txt: [('-', ''), ('-', ''), ('-utmost', 'utmost')] WMH19060620-V04-24-page2.txt: [('REPENT-', 'REPENT'), ('RE-', 'RE'), ('REPENT-', 'REPENT'), ('-', '')] WMH19060620-V04-24-page3.txt: [('-obedience', 'obedience'), ('-', '')] WMH19060620-V04-24-page4.txt: [('returning-', 'returning'), ('ut-', 'ut')] WMH19060627-V04-25-page1.txt: [('viz.--', 'viz.-'), ('-four-page', 'four-page'), ('-', ''), ('Im-', 'Im')] WMH19060627-V04-25-page2.txt: [('-RST', 'RST'), ('sympaths-', 'sympaths')] WMH19060627-V04-25-page4.txt: [('-', '')] WMH19060704-V04-26-page1.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('nee-', 'nee')] WMH19060704-V04-26-page2.txt: [('-the', 'the'), ('some-', 'some')] WMH19060704-V04-26-page3.txt: [('per-', 'per'), ('--Success.', '-Success.')] WMH19060704-V04-26-page4.txt: [('-', '')] WMH19060711-V04-27-page1.txt: [('in-', 'in')] WMH19060711-V04-27-page2.txt: [('-', ''), ('Offerings-', 'Offerings'), ('-', '')] WMH19060711-V04-27-page3.txt: [('-', ''), ('-', ''), ('-', '')] WMH19060711-V04-27-page4.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('employ-', 'employ'), ('Contro-', 'Contro')] WMH19060718-V04-28-page1.txt: [('-', ''), ('lir-', 'lir'), ('-', ''), ('-school', 'school'), ('-', ''), ('-', ''), ('-', ''), ('---', '--'), ('-', '')] WMH19060718-V04-28-page2.txt: [('-', ''), ('ends--', 'ends-'), ('sup-', 'sup')] WMH19060718-V04-28-page3.txt: [('Con-', 'Con')] WMH19060718-V04-28-page4.txt: [('--', '-'), ('Pennsyl-', 'Pennsyl')] WMH19060725-V04-29-page1.txt: [('-', ''), ('REAPETHGATHRETI-', 'REAPETHGATHRETI'), ('\'"--', '\'"-')] WMH19060725-V04-29-page2.txt: [('es-', 'es'), ('re-', 're')] WMH19060725-V04-29-page3.txt: [('COL-', 'COL')] WMH19060725-V04-29-page4.txt: [('-', ''), ('Ed-', 'Ed'), ('Healing--', 'Healing-'), ('"Left-', '"Left'), ('Safe-', 'Safe')] WMH19060801-V04-30-page1.txt: [('-', '')] WMH19060801-V04-30-page2.txt: [('us-', 'us')] WMH19060801-V04-30-page3.txt: [('"A"-', '"A"')] WMH19060808-V04-31-page1.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('APPRO-', 'APPRO'), ('CAN-', 'CAN'), ('REA-', 'REA')] WMH19060808-V04-31-page3.txt: [('I-', 'I'), ('-', ''), ('re-', 're')] WMH19060808-V04-31-page4.txt: [('.-', '.'), ('con-', 'con'), ('-followers', 'followers'), ('-', '')] WMH19060822-V04-32-page1.txt: [('a-', 'a'), ('r-', 'r'), ('-', ''), ('-study', 'study'), ('-', '')] WMH19060822-V04-32-page2.txt: [('-', '')] WMH19060822-V04-32-page3.txt: [('-', ''), ('-', ''), ('to-day--', 'to-day-'), ('political--', 'political-')] WMH19060822-V04-32-page4.txt: [('-had', 'had')] WMH19060829-V04-33-page1.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19060829-V04-33-page2.txt: [('-the', 'the'), ('-one', 'one'), ('-', '')] WMH19060829-V04-33-page3.txt: [('-', ''), ('-', '')] WMH19060829-V04-33-page4.txt: [('-', '')] WMH19060905-V04-34-page1.txt: [('-', ''), ('--Isaac', '-Isaac'), ('-', ''), ('so.-', 'so.')] WMH19060905-V04-34-page2.txt: [('SPE-', 'SPE'), ('-', '')] WMH19060905-V04-34-page3.txt: [('lle-', 'lle'), ('Confer-', 'Confer')] WMH19060912-V04-35-page1.txt: [('pro-', 'pro'), ('-', '')] WMH19060912-V04-35-page2.txt: [('-', ''), ('-six', 'six')] WMH19060912-V04-35-page3.txt: [('re-', 're'), ('-Children', 'Children')] WMH19060919-V04-36-page1.txt: [('-', ''), ('-by', 'by'), ('-', ''), ('-heart', 'heart'), ('-Exclaims', 'Exclaims'), ('uncorrupti-', 'uncorrupti')] WMH19060919-V04-36-page2.txt: [('-', ''), ('-', ''), ('mem-', 'mem')] WMH19060919-V04-36-page3.txt: [('-breaking', 'breaking')] WMH19060919-V04-36-page4.txt: [('-and', 'and'), ('-', '')] WMH19060926-V04-37-page1.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('o-', 'o'), ('-', ''), ('-', ''), ('Mes-', 'Mes'), ('-', ''), ('-', ''), ('-', '')] WMH19060926-V04-37-page2.txt: [('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19060926-V04-37-page3.txt: [('schools.--', 'schools.-')] WMH19060926-V04-37-page4.txt: [('"sulpherbag-', '"sulpherbag'), ('exalt-', 'exalt')] WMH19061003-V04-38-page1.txt: [('-', ''), ('----', '---'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('.-', '.')] WMH19061003-V04-38-page4.txt: [('-', ''), ('-Sam', 'Sam'), ('-', '')] WMH19061010-V04-39-page1.txt: [('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19061010-V04-39-page2.txt: [('-', ''), ('-', '')] WMH19061010-V04-39-page3.txt: [('possible-', 'possible')] WMH19061017-V04-40-page1.txt: [('EAST-', 'EAST'), ('SEND-', 'SEND'), ('TAK-', 'TAK'), ('-', ''), ('--', '-'), ('GATf.-', 'GATf.'), ('-', ''), ('-', ''), ('-', ''), ('con-', 'con')] WMH19061017-V04-40-page2.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('Pub-', 'Pub'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('im-', 'im'), ('every-', 'every'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-variety', 'variety'), ('ad-', 'ad'), ('how-', 'how'), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19061017-V04-40-page3.txt: [('intelligently-', 'intelligently'), ('es-', 'es'), ('SWAHN-', 'SWAHN'), ('sec-', 'sec')] WMH19061017-V04-40-page4.txt: [('RyDER-', 'RyDER'), ('-with', 'with')] WMH19061024-V04-41-page1.txt: [('-', ''), ('-', ''), ('-', '')] WMH19061024-V04-41-page3.txt: [('--Education.', '-Education.')] WMH19061024-V04-41-page4.txt: [('-the', 'the')] WMH19061031-V04-42-page1.txt: [('-wholly', 'wholly'), ('pur-', 'pur'), ('-', ''), ('-', ''), ('-', ''), ('INTERNA-', 'INTERNA'), ('INTER-', 'INTER')] WMH19061031-V04-42-page2.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('P-', 'P'), ('-', '')] WMH19061031-V04-42-page3.txt: [('de-', 'de'), ('--', '-'), ('-----', '----'), ('hav-', 'hav')] WMH19061107-V04-43-page1.txt: [('-', ''), ('-', ''), ('--This', '-This'), ('--', '-')] WMH19061107-V04-43-page2.txt: [('---I', '--I')] WMH19061107-V04-43-page3.txt: [('Gener-', 'Gener')] WMH19061107-V04-43-page4.txt: [('-', '')] WMH19061114-V04-44-page1.txt: [('-', ''), ('-', ''), ('--upmuscle', '-upmuscle'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('--I', '-I')] WMH19061114-V04-44-page2.txt: [('--I', '-I'), ('child-', 'child')] WMH19061114-V04-44-page3.txt: [('submerg-', 'submerg')] WMH19061114-V04-44-page4.txt: [('-the', 'the'), ('De-', 'De'), ('-', ''), ('-', '')] WMH19061121-V04-45-page1.txt: [('-', ''), ('-that', 'that'), ('--I', '-I'), ('--labored', '-labored'), ('--Well', '-Well'), ('-', '')] WMH19061121-V04-45-page2.txt: [('--Chr', '-Chr')] WMH19061121-V04-45-page3.txt: [('-to', 'to'), ('doubt-', 'doubt'), ('in-', 'in')] WMH19061121-V04-45-page4.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19061128-V04-46-page1.txt: [('qmir-', 'qmir'), ('-', ''), ('work.-', 'work.')] WMH19061128-V04-46-page2.txt: [('min-', 'min'), ('dis-', 'dis')] WMH19061128-V04-46-page3.txt: [('re-', 're'), ('-', ''), ('let-', 'let')] WMH19061128-V04-46-page4.txt: [('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19061205-V04-47-page1.txt: [('--Selected.', '-Selected.'), ('Sunday-', 'Sunday'), ('moun-', 'moun'), ('-', ''), ('-', '')] WMH19061205-V04-47-page2.txt: [('-', '')] WMH19061205-V04-47-page3.txt: [('-', ''), ('-', '')] WMH19061205-V04-47-page4.txt: [('occasion.-', 'occasion.')] WMH19061212-V04-48-page1.txt: [('-', ''), ('-Let', 'Let'), ('-the', 'the'), ('Wednes-', 'Wednes'), ('-', ''), ('-', ''), ('-', ''), ('work-', 'work'), ('-', ''), ('-', '')] WMH19061212-V04-48-page2.txt: [('-', ''), ('AD-', 'AD')] WMH19061212-V04-48-page3.txt: [('mani-', 'mani')] WMH19061212-V04-48-page4.txt: [('-', ''), ('-', '')] WMH19061219-V04-49-page1.txt: [('-', ''), ('-not', 'not'), ('--', '-')] WMH19061219-V04-49-page2.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19061226-V04-50-page1.txt: [('-', ''), ('--', '-'), ('-', ''), ('Sunday-', 'Sunday'), ('-', '')] WMH19061226-V04-50-page3.txt: [('..-', '..'), ('-paper', 'paper'), ('-"Jesus', '"Jesus')] WMH19061226-V04-50-page4.txt: [('--toe', '-toe')] WMH19070102-V05-01-page1.txt: [('-that', 'that'), ('-on', 'on'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19070102-V05-01-page2.txt: [('educa-', 'educa'), ('-', ''), ('-and', 'and'), ('Sunday-', 'Sunday')] WMH19070102-V05-01-page4.txt: [('-of', 'of'), ('I-', 'I'), ('-', '')] WMH19070109-V05-02-page1.txt: [('glar--', 'glar-'), ('ATI-', 'ATI'), ('-', '')] WMH19070109-V05-02-page2.txt: [('the-', 'the'), ('Pres.-', 'Pres.')] WMH19070109-V05-02-page3.txt: [('outpeo-', 'outpeo')] WMH19070109-V05-02-page4.txt: [('-', ''), ('-', ''), ('De-', 'De'), ('-', '')] WMH19070116-V05-03-page1.txt: [('-.', '.'), ('-', ''), ('-', ''), ('-take', 'take'), ('-the', 'the'), ('-come', 'come')] WMH19070116-V05-03-page2.txt: [('or-', 'or'), ('-der', 'der')] WMH19070116-V05-03-page3.txt: [('Fiske-', 'Fiske')] WMH19070123-V05-04-page1.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('W.J-', 'W.J'), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19070123-V05-04-page2.txt: [('-simple', 'simple'), ('ex-', 'ex'), ('-', '')] WMH19070123-V05-04-page3.txt: [('-', ''), ('con-', 'con'), ('-of', 'of')] WMH19070123-V05-04-page4.txt: [('opposi-', 'opposi'), ('-shed', 'shed')] WMH19070130-V05-05-page1.txt: [('accord-', 'accord'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19070130-V05-05-page4.txt: [('-', ''), ('RE-', 'RE')] WMH19070206-V05-06-page1.txt: [('-', ''), ('----', '---'), ('-would', 'would'), ('camp-meet-', 'camp-meet'), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19070206-V05-06-page2.txt: [('Mt.-', 'Mt.'), ('-A', 'A'), ('-', ''), ('Barretr"-', 'Barretr"'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19070206-V05-06-page3.txt: [('-denomination', 'denomination'), ('-', '')] WMH19070206-V05-06-page4.txt: [('-is', 'is'), ('-similar', 'similar'), ('-', '')] WMH19070213-V05-07-page1.txt: [('ac-', 'ac'), ('Depart-', 'Depart')] WMH19070213-V05-07-page2.txt: [('-', '')] WMH19070213-V05-07-page4.txt: [('.seal-', '.seal'), ('-and', 'and')] WMH19070220-V05-08-page1.txt: [('-to', 'to')] WMH19070220-V05-08-page2.txt: [('OFFER-', 'OFFER'), ('-', '')] WMH19070220-V05-08-page3.txt: [('-', ''), ('-', '')] WMH19070220-V05-08-page4.txt: [('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19070227-V05-09-page1.txt: [('-', ''), ('-', ''), ('-', ''), ('Confer-', 'Confer'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19070227-V05-09-page2.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19070227-V05-09-page3.txt: [('Dis-', 'Dis'), ('-to', 'to'), ('-colporter', 'colporter'), ('truth.-', 'truth.')] WMH19070227-V05-09-page4.txt: [('HERALD.-', 'HERALD.'), ('-', ''), ('-i', 'i'), ('--', '-'), ('-', ''), ('-', ''), ('-than', 'than'), ('Re-', 'Re')] WMH19070306-V05-10-page1.txt: [('-stand', 'stand')] WMH19070306-V05-10-page2.txt: [('SOLD-', 'SOLD'), ('-', '')] WMH19070306-V05-10-page3.txt: [('LIBER-', 'LIBER')] WMH19070306-V05-10-page4.txt: [('-', ''), ('-', ''), ('-', '')] WMH19070313-V05-11-page1.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-.', '.'), ('-plain', 'plain'), ('--"And', '-"And'), ('--met', '-met')] WMH19070313-V05-11-page2.txt: [('-', '')] WMH19070313-V05-11-page3.txt: [('--practical', '-practical')] WMH19070313-V05-11-page4.txt: [('-', ''), ('-', ''), ('-', ''), ('or-', 'or'), ('Consti-', 'Consti'), ('-', '')] WMH19070320-V05-12-page2.txt: [('-', '')] WMH19070320-V05-12-page3.txt: [('-r', 'r'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19070327-V05-13-page3.txt: [('associat-', 'associat'), ('consci-', 'consci'), ('--', '-')] WMH19070327-V05-13-page4.txt: [('--', '-'), ('RE-', 'RE'), ('de-', 'de'), ('-', '')] WMH19070403-V05-14-page1.txt: [('-', ''), ('II-', 'II')] WMH19070403-V05-14-page2.txt: [('-', ''), ('con-', 'con')] WMH19070403-V05-14-page3.txt: [('-know', 'know')] WMH19070403-V05-14-page4.txt: [('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19070410-V05-15-page1.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19070410-V05-15-page2.txt: [('--', '-'), ('-', '')] WMH19070410-V05-15-page4.txt: [('-', '')] WMH19070417-V05-16-page1.txt: [('-', ''), ('-', ''), ('-', ''), ('cor-', 'cor'), ('-', ''), ('-', ''), ('REAPETI-', 'REAPETI'), ('be-', 'be')] WMH19070417-V05-16-page2.txt: [('Danish-', 'Danish')] WMH19070417-V05-16-page3.txt: [('-', ''), ('--a', '-a')] WMH19070417-V05-16-page4.txt: [('Pil-', 'Pil')] WMH19070424-V05-17-page1.txt: [('GATHERED-', 'GATHERED'), ('-', ''), ('-', ''), ('corn-', 'corn'), ('-', ''), ('-', ''), ('-', '')] WMH19070424-V05-17-page2.txt: [('the-', 'the'), ('IN-', 'IN'), ('IN-', 'IN'), ('-', '')] WMH19070424-V05-17-page3.txt: [('oforthog-', 'oforthog')] WMH19070424-V05-17-page4.txt: [('-', '')] WMH19070501-V05-18-page1.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('wasevery-', 'wasevery'), ('chap-', 'chap'), ('them-', 'them'), ('-', ''), ('-', ''), ('-', '')] WMH19070501-V05-18-page2.txt: [('confede-', 'confede')] WMH19070501-V05-18-page3.txt: [('devotedcanvas-', 'devotedcanvas')] WMH19070501-V05-18-page4.txt: [('interested-', 'interested'), ('-', '')] WMH19070508-V05-19-page1.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('NOW.-', 'NOW.'), ('plain-', 'plain')] WMH19070508-V05-19-page2.txt: [('-', ''), ('pur-', 'pur')] WMH19070508-V05-19-page3.txt: [('-', ''), ('understand.--', 'understand.-')] WMH19070508-V05-19-page4.txt: [('-', '')] WMH19070515-V05-20-page1.txt: [('-', ''), ('-', ''), ('ac-', 'ac'), ('--', '-')] WMH19070515-V05-20-page3.txt: [('Attor-', 'Attor'), ('-', ''), ('-', ''), ('Church-', 'Church'), ('-----', '----'), ('-', ''), ('-', '')] WMH19070515-V05-20-page4.txt: [('--he', '-he'), ('Conference-', 'Conference'), ('WATCH-', 'WATCH'), ('-', '')] WMH19070522-V05-21-page1.txt: [('-es', 'es'), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19070522-V05-21-page3.txt: [('-', '')] WMH19070522-V05-21-page4.txt: [('Seventh-', 'Seventh'), ('--praise', '-praise'), ('-', ''), ('-', ''), ('-', ''), ('-more', 'more')] WMH19070529-V05-22-page1.txt: [('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19070529-V05-22-page2.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('af-', 'af')] WMH19070529-V05-22-page3.txt: [('--', '-')] WMH19070529-V05-22-page4.txt: [('-', ''), ('-A.', 'A.'), ('-', ''), ('-', ''), ('-', ''), ('refresh-', 'refresh')] WMH19070605-V05-23-page1.txt: [('--', '-'), ('--every', '-every'), ('-', ''), ('-', ''), ('-', '')] WMH19070605-V05-23-page2.txt: [('waiting--', 'waiting-')] WMH19070605-V05-23-page3.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('pub-', 'pub'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19070612-V05-24-page1.txt: [('-intercessor.', 'intercessor.'), ('-', '')] WMH19070612-V05-24-page3.txt: [('-', ''), ('experience--', 'experience-'), ('Sabbath-', 'Sabbath')] WMH19070619-V05-25-page1.txt: [('con-', 'con'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19070619-V05-25-page2.txt: [('-', ''), ('--In', '-In'), ('-', ''), ('-', ''), ('-', '')] WMH19070619-V05-25-page3.txt: [('-', ''), ('Sabbath-', 'Sabbath'), ('Sabbath-', 'Sabbath'), ('-of', 'of')] WMH19070619-V05-25-page4.txt: [('-', '')] WMH19070626-V05-26-page1.txt: [('-', ''), ('adorn-', 'adorn'), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19070626-V05-26-page2.txt: [('--instructs', '-instructs')] WMH19070626-V05-26-page3.txt: [('HER-', 'HER')] WMH19070626-V05-26-page4.txt: [('them-', 'them'), ('--affirmed.', '-affirmed.')] WMH19070703-V05-27-page1.txt: [('-', '')] WMH19070703-V05-27-page4.txt: [('-', ''), ('-', '')] WMH19070710-V05-28-page1.txt: [('GATHERETI-', 'GATHERETI'), ('--', '-'), ('Sabbath-', 'Sabbath'), ('-training.', 'training.'), ('--not', '-not')] WMH19070710-V05-28-page2.txt: [('oc-', 'oc')] WMH19070710-V05-28-page4.txt: [('-', ''), ('Camp-', 'Camp'), ('ques-', 'ques')] WMH19070717-V05-29-page1.txt: [('-', ''), ('-', ''), ('de-', 'de'), ('-----', '----'), ('-school', 'school'), ('-', ''), ('-', ''), ('-', '')] WMH19070717-V05-29-page2.txt: [('FATH-', 'FATH'), ('righteous-', 'righteous')] WMH19070724-V05-30-page1.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('E-', 'E'), ('-.', '.'), ('-', '')] WMH19070724-V05-30-page2.txt: [('MORN-', 'MORN')] WMH19070731-V05-31-page1.txt: [('re-', 're')] WMH19070731-V05-31-page2.txt: [('-', ''), ('-', ''), ('-', '')] WMH19070731-V05-31-page4.txt: [('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19070807-V05-32-page1.txt: [('--', '-'), ('-', ''), ('-"', '"'), ('-notify', 'notify')] WMH19070807-V05-32-page2.txt: [('Camp--', 'Camp-'), ('-', '')] WMH19070807-V05-32-page3.txt: [('-', ''), ('-', ''), ('-', ''), ('--health', '-health'), ('Seventh-', 'Seventh'), ('-', '')] WMH19070807-V05-32-page4.txt: [('-', ''), ('-', ''), ('-', '')] WMH19070814-V05-33-page1.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-dispensation.', 'dispensation.')] WMH19070814-V05-33-page2.txt: [('-', '')] WMH19070814-V05-33-page3.txt: [('mis-', 'mis')] WMH19070814-V05-33-page4.txt: [('-', ''), ('per-', 'per')] WMH19070828-V05-34-page1.txt: [('-', '')] WMH19070828-V05-34-page2.txt: [('-a', 'a'), ('-', ''), ('-', ''), ('prov-', 'prov')] WMH19070828-V05-34-page3.txt: [('-', '')] WMH19070828-V05-34-page4.txt: [('-', ''), ('.ASSOCIA-', '.ASSOCIA'), ('-page', 'page')] WMH19070904-V05-35-page1.txt: [('-', ''), ('---', '--'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('GATHERETI-', 'GATHERETI'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19070904-V05-35-page2.txt: [('de-', 'de')] WMH19070904-V05-35-page3.txt: [('-', '')] WMH19070911-V05-36-page1.txt: [('-', ''), ('GATHRETI-', 'GATHRETI'), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19070911-V05-36-page2.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19070918-V05-37-page1.txt: [('Dr.-', 'Dr.'), ("'-", "'"), ('GATLiERETI-', 'GATLiERETI')] WMH19070918-V05-37-page2.txt: [('-coming', 'coming'), ('SERIES-', 'SERIES'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19070918-V05-37-page3.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19070918-V05-37-page4.txt: [('LAN-', 'LAN'), ('-', ''), ('es-', 'es'), ('-', ''), ('-', ''), ('-', '')] WMH19070925-V05-38-page1.txt: [('re-', 're'), ('hear-', 'hear'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('GATHERETI-', 'GATHERETI')] WMH19070925-V05-38-page3.txt: [('-', ''), ('-', ''), ('---', '--'), ('-Covert', 'Covert')] WMH19070925-V05-38-page4.txt: [('beau-', 'beau')] WMH19071002-V05-39-page1.txt: [('-', ''), ('insti-', 'insti'), ('-', ''), ('-', ''), ('-', ''), ('.-', '.'), ('-', '')] WMH19071002-V05-39-page2.txt: [('-', ''), ('-', '')] WMH19071002-V05-39-page3.txt: [('-on', 'on')] WMH19071009-V05-40-page1.txt: [('-reined', 'reined'), ('mot-', 'mot'), ('-', ''), ('-This', 'This'), ('begin-', 'begin'), ('"-----', '"----'), ('-', ''), ('-', '')] WMH19071009-V05-40-page2.txt: [('par-', 'par')] WMH19071009-V05-40-page3.txt: [('-', ''), ('-', ''), ('.-', '.'), ('-', '')] WMH19071009-V05-40-page4.txt: [('-', ''), ('-Doctor', 'Doctor'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19071016-V05-41-page1.txt: [('-Last', 'Last'), ('con-', 'con'), ('-', ''), ('-', ''), ('-', ''), ('-HE', 'HE'), ('-', ''), ('al-', 'al'), ('para-', 'para'), ('un-', 'un'), ('self-', 'self')] WMH19071016-V05-41-page2.txt: [('en-', 'en'), ('-joy', 'joy'), ('per-', 'per')] WMH19071016-V05-41-page3.txt: [('-a', 'a'), ('GENERA-', 'GENERA')] WMH19071016-V05-41-page4.txt: [('-', ''), ('WORK-', 'WORK')] WMH19071023-V05-42-page1.txt: [('com-', 'com'), ('-', ''), ('-vil', 'vil'), ('-', ''), ('confirm-', 'confirm'), ('question-', 'question')] WMH19071023-V05-42-page2.txt: [('-', ''), ('re-', 're'), ('-', '')] WMH19071023-V05-42-page3.txt: [('-field.', 'field.'), ('-cometogether.', 'cometogether.'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19071023-V05-42-page4.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19071030-V05-43-page1.txt: [('-', '')] WMH19071030-V05-43-page2.txt: [('Mich-', 'Mich')] WMH19071030-V05-43-page3.txt: [('.-', '.'), ('-', ''), ('-', ''), ('-Ps.', 'Ps.')] WMH19071030-V05-43-page4.txt: [('-both', 'both'), ('-', '')] WMH19071106-V05-44-page1.txt: [("'--", "'-"), ('GOV-', 'GOV'), ('SUPER-', 'SUPER'), ('SAB-', 'SAB')] WMH19071106-V05-44-page2.txt: [('DISESTAB-', 'DISESTAB'), ('--', '-'), ('-', '')] WMH19071106-V05-44-page3.txt: [('-', ''), ('-', ''), ('-', ''), ('w.-', 'w.'), ('-bara', 'bara'), ('-', '')] WMH19071106-V05-44-page4.txt: [('-Note', 'Note'), ('-', ''), ('-The', 'The')] WMH19071113-V05-45-page1.txt: [('instruct-', 'instruct'), ('GATOERETI-', 'GATOERETI')] WMH19071113-V05-45-page3.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19071113-V05-45-page4.txt: [('-', '')] WMH19071120-V05-46-page1.txt: [('eIRV-', 'eIRV'), ('GATHERETI-', 'GATHERETI'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19071120-V05-46-page2.txt: [('-with', 'with'), ('--aneyear', '-aneyear'), ('-members', 'members'), ('SERIES-', 'SERIES')] WMH19071120-V05-46-page3.txt: [('--', '-'), ('-', ''), ('Ending-', 'Ending'), ('-', '')] WMH19071127-V05-47-page1.txt: [('-', ''), ('-', ''), ('Vr-', 'Vr'), ('ans-', 'ans'), ('pre-', 'pre'), ('-and', 'and'), ('-nothingness', 'nothingness'), ('im-', 'im'), ('be-', 'be')] WMH19071127-V05-47-page2.txt: [("-widow's", "widow's"), ('-a', 'a'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-i', 'i'), ('-', ''), ('-', ''), ('-ix.', 'ix.'), ('-', '')] WMH19071127-V05-47-page3.txt: [('-', ''), ('RE-', 'RE'), ('DE-', 'DE'), ('"sur-', '"sur')] WMH19071127-V05-47-page4.txt: [('-', '')] WMH19071204-V05-48-page1.txt: [('-', ''), ('faith-', 'faith')] WMH19071204-V05-48-page2.txt: [('-', '')] WMH19071204-V05-48-page3.txt: [('-', ''), ('-Their', 'Their')] WMH19071204-V05-48-page4.txt: [('-', ''), ('reg-', 'reg')] WMH19071211-V05-49-page1.txt: [('-', ''), ('-lad', 'lad'), ('-disease', 'disease'), ('-', '')] WMH19071211-V05-49-page2.txt: [('-', ''), ('-', ''), ('the-', 'the')] WMH19071211-V05-49-page3.txt: [('-', ''), ('Carr-', 'Carr'), ('Le-', 'Le')] WMH19071211-V05-49-page4.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19071218-V05-50-page1.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('temp-', 'temp'), ('-', ''), ('ser-', 'ser')] WMH19071218-V05-50-page2.txt: [('-Lord.', 'Lord.'), ('in-', 'in')] WMH19071218-V05-50-page3.txt: [('-', ''), ('criti-', 'criti'), ('-', '')] WMH19071218-V05-50-page4.txt: [('-', ''), ('Ed-', 'Ed'), ('-', ''), ('--', '-'), ('-', ''), ('-', '')] WMH19080101-V06-01-page2.txt: [('r-', 'r'), ('WATCH-', 'WATCH'), ('--', '-'), ('-ceed.', 'ceed.'), ('success."-', 'success."'), ('-', ''), ('-', '')] WMH19080101-V06-01-page3.txt: [('-', ''), ('sub-', 'sub'), ('-been', 'been')] WMH19080101-V06-01-page4.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-the', 'the'), ('consecrat-', 'consecrat'), ("-'Our", "'Our"), ('Con-', 'Con'), ('-from', 'from')] WMH19080108-V06-02-page1.txt: [('mic.-', 'mic.'), ('--Right', '-Right'), ('-', ''), ('-exciteme', 'exciteme'), ('-The', 'The'), ('gath-', 'gath'), ('-public', 'public'), ('informa-', 'informa')] WMH19080108-V06-02-page2.txt: [('-done', 'done'), ('Grand-', 'Grand'), ('-', '')] WMH19080108-V06-02-page3.txt: [('-', ''), ('-II.', 'II.'), ('-man.', 'man.'), ('-', '')] WMH19080108-V06-02-page4.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19080115-V06-03-page1.txt: [('WATCH-', 'WATCH')] WMH19080115-V06-03-page2.txt: [('-Our', 'Our'), ('WATCH-', 'WATCH'), ('-', '')] WMH19080115-V06-03-page3.txt: [('distri-', 'distri'), ('-company', 'company'), ('worle-', 'worle')] WMH19080115-V06-03-page4.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19080122-V06-04-page1.txt: [('-the', 'the'), ('pic-', 'pic'), ('faith-', 'faith')] WMH19080122-V06-04-page2.txt: [('-and', 'and'), ('fashion-', 'fashion')] WMH19080122-V06-04-page3.txt: [('-', '')] WMH19080122-V06-04-page4.txt: [('--Wellspri', '-Wellspri'), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19080129-V06-05-page1.txt: [('fte-', 'fte'), ('GATHERETI-', 'GATHERETI')] WMH19080129-V06-05-page2.txt: [('-', '')] WMH19080129-V06-05-page3.txt: [('And-', 'And'), ('-This', 'This'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19080129-V06-05-page4.txt: [('-', ''), ('-', ''), ('-', ''), ('BET-', 'BET'), ('-', ''), ('-', ''), ('-', '')] WMH19080205-V06-06-page1.txt: [('-..t..Pft', '..t..Pft'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-is', 'is')] WMH19080205-V06-06-page2.txt: [('church-', 'church')] WMH19080205-V06-06-page4.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19080212-V06-07-page1.txt: [('Thous-', 'Thous'), ('-', ''), ('-', ''), ('-', ''), ('Mission-', 'Mission'), ('-', ''), ('-', ''), ('Michigan-', 'Michigan'), ('Ex-', 'Ex'), ('-', ''), ('Nash-', 'Nash'), ('-', '')] WMH19080212-V06-07-page2.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('Conf.-', 'Conf.'), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19080212-V06-07-page3.txt: [('-present', 'present'), ('-loss', 'loss'), ('.-', '.'), ('-', ''), ('--', '-'), ('-', ''), ('-new', 'new'), ('-', '')] WMH19080212-V06-07-page4.txt: [('-', ''), ('Se-', 'Se'), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19080219-V06-08-page1.txt: [('-', ''), ('-', ''), ('Permits-', 'Permits'), ('Credentials-', 'Credentials'), ('Total-', 'Total'), ('employed--', 'employed-'), ('Educa-', 'Educa')] WMH19080219-V06-08-page3.txt: [('-our', 'our'), ('-', ''), ('-', '')] WMH19080219-V06-08-page4.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19080226-V06-09-page1.txt: [('-', ''), ('-', ''), ('February-', 'February'), ('-', ''), ('-', ''), ('min-', 'min')] WMH19080226-V06-09-page2.txt: [('De-', 'De'), ('peo-', 'peo')] WMH19080226-V06-09-page3.txt: [('-', ''), ('plan-', 'plan')] WMH19080226-V06-09-page4.txt: [('-', '')] WMH19080304-V06-10-page1.txt: [('-', ''), ('-Manager', 'Manager')] WMH19080304-V06-10-page2.txt: [('year-', 'year')] WMH19080304-V06-10-page4.txt: [('.-', '.')] WMH19080311-V06-11-page1.txt: [('-z', 'z'), ('-second', 'second'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('famil-', 'famil'), ('GATHERETI-', 'GATHERETI'), ('--', '-')] WMH19080311-V06-11-page3.txt: [('-and', 'and'), ('con-', 'con')] WMH19080311-V06-11-page4.txt: [('-', ''), ('-', ''), ('-', '')] WMH19080318-V06-12-page1.txt: [('-.', '.'), ('ERETI-', 'ERETI'), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19080318-V06-12-page3.txt: [('-', ''), ('-R.', 'R.'), ('-', '')] WMH19080318-V06-12-page4.txt: [('-', '')] WMH19080325-V06-13-page1.txt: [('Treas-', 'Treas'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-delegates', 'delegates'), ('the-', 'the'), ('-', ''), ('-', ''), ('GATMERETI-', 'GATMERETI')] WMH19080325-V06-13-page2.txt: [('-John', 'John'), ('-so', 'so'), ('-', ''), ('March-', 'March')] WMH19080325-V06-13-page4.txt: [("-cardinal'", "cardinal'"), ('-Nebr.', 'Nebr.'), ('WATCHMAN.-', 'WATCHMAN.')] WMH19080401-V06-14-page1.txt: [('unad-', 'unad'), ('-', ''), ('-', ''), ('GATRERETI-', 'GATRERETI'), ('-', '')] WMH19080401-V06-14-page2.txt: [('super-', 'super'), ('-', '')] WMH19080401-V06-14-page4.txt: [('Years-', 'Years'), ('-June', 'June'), ('-', '')] WMH19080408-V06-15-page1.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19080408-V06-15-page4.txt: [('-We', 'We')] WMH19080415-V06-16-page1.txt: [('-', ''), ('-', ''), ('-', ''), ('Mes-', 'Mes'), ('-', ''), ('-', ''), ('-', ''), ('Mc-', 'Mc'), ('-', '')] WMH19080415-V06-16-page2.txt: [('re-', 're')] WMH19080415-V06-16-page3.txt: [('-', ''), ('-Fitch', 'Fitch'), ('-the', 'the'), ('-we', 'we'), ('Seventh-', 'Seventh'), ('-our', 'our'), ('-', '')] WMH19080422-V06-17-page1.txt: [('-', ''), ('-', ''), ('-', '')] WMH19080422-V06-17-page2.txt: [('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19080422-V06-17-page4.txt: [('diet--', 'diet-')] WMH19080429-V06-18-page1.txt: [('-', ''), ('-', ''), ('-', ''), ('---', '--'), ('-', ''), ('Seventh-', 'Seventh')] WMH19080429-V06-18-page2.txt: [('-', ''), ('-', '')] WMH19080429-V06-18-page3.txt: [('busi-', 'busi'), ('-', ''), ('weep-', 'weep'), ('Seventh-', 'Seventh'), ('-reading', 'reading'), ('inform-', 'inform')] WMH19080429-V06-18-page4.txt: [('-', '')] WMH19080506-V06-19-page1.txt: [('-', ''), ('-', ''), ('-', ''), ('Ir-', 'Ir')] WMH19080506-V06-19-page2.txt: [('peo.-', 'peo.'), ('-c.', 'c.')] WMH19080506-V06-19-page3.txt: [('-', ''), ('-', ''), ('child-', 'child'), ('du-', 'du')] WMH19080506-V06-19-page4.txt: [('---', '--'), ('-', ''), ('-', ''), ('-', '')] WMH19080513-V06-20-page1.txt: [('laborer-', 'laborer'), ('uper-', 'uper'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('GATHERETI-', 'GATHERETI'), ('z-', 'z')] WMH19080513-V06-20-page2.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19080513-V06-20-page4.txt: [('-', ''), ('-', ''), ('-', ''), ('-on', 'on'), ('-', '')] WMH19080520-V06-21-page1.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('Hof-', 'Hof'), ('-', ''), ('-', ''), ('-', ''), ('-they', 'they'), ('-', ''), ('sifsisev-', 'sifsisev')] WMH19080520-V06-21-page2.txt: [('Center--', 'Center-'), ('-that', 'that'), ('-', ''), ('Ed-', 'Ed')] WMH19080520-V06-21-page3.txt: [('-will', 'will'), ('-', ''), ('-', ''), ('-', '')] WMH19080520-V06-21-page4.txt: [('-', '')] WMH19080527-V06-22-page1.txt: [('-', ''), ('-', ''), ('rush-', 'rush'), ('-', ''), ('-', ''), ('to-', 'to')] WMH19080527-V06-22-page2.txt: [('-flavors', 'flavors'), ('-', ''), ('-Two', 'Two')] WMH19080527-V06-22-page4.txt: [('-', ''), ('PRO-', 'PRO')] WMH19080603-V06-23-page1.txt: [('it-', 'it'), ('-', ''), ('mechani-', 'mechani'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19080603-V06-23-page2.txt: [('-minds', 'minds')] WMH19080603-V06-23-page3.txt: [('-', '')] WMH19080603-V06-23-page4.txt: [('-', ''), ('hold-', 'hold')] WMH19080610-V06-24-page1.txt: [('-.', '.'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('FIELDSECRETARYr-', 'FIELDSECRETARYr')] WMH19080610-V06-24-page2.txt: [('-', ''), ('corn-', 'corn'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19080610-V06-24-page3.txt: [('-we', 'we')] WMH19080610-V06-24-page4.txt: [('-', ''), ('-James', 'James'), ('HER-', 'HER')] WMH19080617-V06-25-page1.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('flush-', 'flush')] WMH19080617-V06-25-page2.txt: [('praise."--', 'praise."-'), ('-', '')] WMH19080617-V06-25-page3.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-------', '------'), ('-', '')] WMH19080617-V06-25-page4.txt: [('-', ''), ('-to', 'to'), ('ap-', 'ap'), ('-', '')] WMH19080624-V06-26-page1.txt: [('"Bless-', '"Bless'), ('-', ''), ('-', ''), ('I-', 'I')] WMH19080624-V06-26-page2.txt: [('-', '')] WMH19080624-V06-26-page3.txt: [('-to', 'to'), ('Frank-', 'Frank')] WMH19080624-V06-26-page4.txt: [('-', ''), ('ap-', 'ap'), ('-', ''), ('-the', 'the')] WMH19080701-V06-27-page1.txt: [('-', ''), ('-', ''), ('Lexi-', 'Lexi'), ('-', ''), ('-', '')] WMH19080701-V06-27-page2.txt: [('camp-', 'camp'), ('-', ''), ('be-', 'be'), ('-', '')] WMH19080701-V06-27-page3.txt: [('-know', 'know')] WMH19080701-V06-27-page4.txt: [('-the', 'the'), ('WATCH-', 'WATCH'), ('-', '')] WMH19080708-V06-28-page1.txt: [('-', ''), ('-', ''), ('-', '')] WMH19080708-V06-28-page3.txt: [('-', ''), ('suffering-', 'suffering'), ('-', '')] WMH19080708-V06-28-page4.txt: [('-', ''), ('-the', 'the'), ('-', ''), ('-', ''), ('as-', 'as')] WMH19080715-V06-29-page1.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-and', 'and'), ('-', '')] WMH19080715-V06-29-page2.txt: [('-ddaayy', 'ddaayy'), ('-', ''), ('-and', 'and'), ('-', '')] WMH19080715-V06-29-page3.txt: [('-church', 'church'), ('-', '')] WMH19080715-V06-29-page4.txt: [('-', ''), ('-', '')] WMH19080722-V06-30-page1.txt: [('Seventh-', 'Seventh'), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19080722-V06-30-page2.txt: [('-issue', 'issue'), ('LIB-', 'LIB'), ('the-', 'the'), ('-', '')] WMH19080722-V06-30-page4.txt: [('-convinced', 'convinced'), ('-great', 'great'), ('-', ''), ('-church', 'church'), ('--James', '-James')] WMH19080729-V06-31-page1.txt: [('-', ''), ('-', ''), ('Hof.-', 'Hof.')] WMH19080729-V06-31-page2.txt: [('-', '')] WMH19080729-V06-31-page3.txt: [('depart-', 'depart'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-Park', 'Park'), ('-', ''), ('let-', 'let')] WMH19080729-V06-31-page4.txt: [('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19080805-V06-32-page1.txt: [('be-', 'be'), ('-', ''), ('-', ''), ('-', ''), ('COM-', 'COM'), ('sub-', 'sub')] WMH19080805-V06-32-page2.txt: [('ox-', 'ox'), ('-', '')] WMH19080805-V06-32-page3.txt: [('-to', 'to'), ('na-', 'na')] WMH19080805-V06-32-page4.txt: [('-', ''), ('-the', 'the')] WMH19080812-V06-33-page1.txt: [('-', ''), ('-', ''), ('af-', 'af'), ('-', ''), ('-', ''), ('-', '')] WMH19080812-V06-33-page2.txt: [('-', '')] WMH19080812-V06-33-page3.txt: [('GENER-', 'GENER'), ('ex-', 'ex')] WMH19080812-V06-33-page4.txt: [('Publish-', 'Publish')] WMH19080826-V06-34-page1.txt: [('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19080826-V06-34-page2.txt: [('Anti-', 'Anti')] WMH19080826-V06-34-page3.txt: [('class-', 'class'), ('-', ''), ('-made', 'made')] WMH19080826-V06-34-page4.txt: [('-', '')] WMH19080902-V06-35-page1.txt: [('-our', 'our'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('--SELECTED.', '-SELECTED.'), ('-', '')] WMH19080902-V06-35-page2.txt: [('-opportune', 'opportune'), ('followed-', 'followed')] WMH19080902-V06-35-page3.txt: [('-', '')] WMH19080902-V06-35-page4.txt: [('-', '')] WMH19080909-V06-36-page1.txt: [('-I', 'I'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('night--', 'night-'), ('-', '')] WMH19080909-V06-36-page2.txt: [('auspi-', 'auspi')] WMH19080909-V06-36-page3.txt: [('-', ''), ('-', '')] WMH19080909-V06-36-page4.txt: [('-', '')] WMH19080916-V06-37-page1.txt: [('-', ''), ('Michigan.-', 'Michigan.'), ('r-', 'r'), ('-', ''), ('-', ''), ('-', ''), ('-arise', 'arise'), ('-holy', 'holy'), ('con-', 'con')] WMH19080916-V06-37-page2.txt: [('-', ''), ('-', ''), ('-neither', 'neither'), ('the-', 'the'), ('-for', 'for'), ('-pay', 'pay')] WMH19080916-V06-37-page3.txt: [('shad-', 'shad'), ('-consummation.', 'consummation.'), ('-', '')] WMH19080916-V06-37-page4.txt: [('Place-', 'Place'), ('-', ''), ('-', ''), ('the-', 'the'), ('Almeda-', 'Almeda')] WMH19080923-V06-38-page1.txt: [('-', ''), ('-', ''), ('-A--udi.t', 'A--udi.t'), ('-', ''), ('-', ''), ('-', ''), ('the-', 'the')] WMH19080923-V06-38-page2.txt: [('dif-', 'dif'), ('-', ''), ('-', ''), ('-', '')] WMH19080923-V06-38-page3.txt: [('G.-', 'G.'), ('-', ''), ('un-', 'un')] WMH19080923-V06-38-page4.txt: [('-you', 'you'), ('-should', 'should')] WMH19080930-V06-39-page1.txt: [('-', ''), ('-and', 'and'), ('go-', 'go'), ('-', ''), ('-', '')] WMH19080930-V06-39-page2.txt: [('-the', 'the')] WMH19080930-V06-39-page3.txt: [('-', ''), ('-', ''), ('influ.-', 'influ.'), ('-and', 'and'), ('-', '')] WMH19080930-V06-39-page4.txt: [('-', ''), ('Conference.-', 'Conference.'), ('-', ''), ('--Mrs.', '-Mrs.'), ('-many', 'many'), ('Van-', 'Van')] WMH19081007-V06-40-page1.txt: [('-', ''), ('-', ''), ('-', ''), ('Hof-', 'Hof')] WMH19081007-V06-40-page2.txt: [('--Ans.', '-Ans.')] WMH19081007-V06-40-page3.txt: [('-Business', 'Business'), ('Mateo-', 'Mateo'), ('ENLIGHTEN-', 'ENLIGHTEN')] WMH19081007-V06-40-page4.txt: [('-heard', 'heard'), ('mis-', 'mis'), ('mes-', 'mes'), ('-', ''), ('-', '')] WMH19081014-V06-41-page1.txt: [('-', ''), ('-', ''), ('-', ''), ('Hof--', 'Hof-'), ('-', ''), ('--', '-'), ('cL-', 'cL')] WMH19081014-V06-41-page2.txt: [('-', ''), ('-', ''), ('-Labor', 'Labor'), ('re-', 're'), ('-the', 'the'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19081014-V06-41-page3.txt: [('commit-', 'commit'), ('develop-', 'develop'), ('corn--', 'corn-')] WMH19081014-V06-41-page4.txt: [('advo-', 'advo'), ('be-', 'be'), ('life.-', 'life.'), ('illus-', 'illus'), ('-to', 'to')] WMH19081021-V06-42-page1.txt: [('especi-', 'especi'), ('-', ''), ('-', ''), ('-', '')] WMH19081021-V06-42-page2.txt: [('for-', 'for')] WMH19081021-V06-42-page3.txt: [('truth-', 'truth'), ('-', ''), ('-continue', 'continue'), ('Pool-', 'Pool')] WMH19081028-V06-43-page1.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('con.-', 'con.'), ('-fulfilled', 'fulfilled')] WMH19081028-V06-43-page2.txt: [('RE-', 'RE'), ('-', '')] WMH19081028-V06-43-page3.txt: [('-this', 'this'), ('-', ''), ('-So', 'So'), ('-Soon', 'Soon')] WMH19081028-V06-43-page4.txt: [('-Growth', 'Growth'), ('-Philippians.', 'Philippians.'), ('-page', 'page')] WMH19081104-V06-44-page1.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19081104-V06-44-page2.txt: [('-obtain', 'obtain'), ('-to', 'to')] WMH19081104-V06-44-page3.txt: [('.-', '.'), ('-', ''), ('quar-', 'quar'), ('Sab-', 'Sab'), ('-', '')] WMH19081104-V06-44-page4.txt: [('Danish-', 'Danish'), ('Danish-', 'Danish'), ('Danish-', 'Danish'), ('MICHI-', 'MICHI')] WMH19081111-V06-45-page1.txt: [('na-', 'na'), ('-', ''), ('-C.', 'C.'), ('-', ''), ('-', ''), ('P-', 'P')] WMH19081111-V06-45-page2.txt: [('Harbor-', 'Harbor'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] WMH19081111-V06-45-page3.txt: [('-', ''), ('firmame-', 'firmame'), ('-', ''), ('-bare', 'bare'), ('-', ''), ('-', '')] WMH19081111-V06-45-page5.txt: [('LJesY-', 'LJesY')]
In [30]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/WMH/correction3 Average verified rate: 0.9762576375579471 Average of error rates: 0.02556951102588687 Total token count: 915705
In [31]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[31]:
[('m', 1749), ('w', 1503), ('g', 1437), ('e', 1313), ('d', 1278), ('r', 688), ('n', 645), ("'", 511), ('f', 444), ('t', 382), ('th', 283), ('co', 172), ('oo', 171), ('sabbathschool', 163), ('io', 120), ('mt', 108), ('k', 107), ('ro', 96), ('wm', 82), ('numbess', 75), ('re', 71), ('u', 69), ("'field", 67), ("canvassers'", 58), ('x', 46), ("'the", 44), ('horr', 39), ("the'", 38), ('rd', 33), ('blendon', 32), ('ex', 32), ('brower', 31), ('harnden', 30), ("f'd", 30), ('mchugh', 29), ('seventhday', 28), ('nd', 28), ('cleora', 27), ('tion', 25), ('nunica', 23), ('sabbathschools', 23), ('q', 23), ("'to", 22), ('-', 21), ('vowyla', 21), ('al', 21), ('z', 20), ('loth', 20), ('fd', 20), ('michi', 20)]
Correction 4 -- Remove extra quotation marks¶
In [33]:
# %load shared_elements/remove_extra_quotation_marks.py
prev = cycle
cycle = "correction4"
directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
os.makedirs(directories['cycle'])
corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))
for filename in corpus:
content = utilities.readfile(directories['prev'], filename)
text = re.sub(r"[0-9,!?$:;&]", " ", content)
tokens = utilities.tokenize_text(text)
corrections = []
for token in tokens:
token_list = list(token)
last_char = token_list[-1]
if last_char is "'":
if len(token) > 1:
if token_list[-2] is 's' or 'S':
pass
else:
corrections.append((token, re.sub(r"'", r"", token)))
else:
pass
elif token[0] is "'":
corrections.append((token, re.sub(r"'", r"", token)))
else:
pass
if len(corrections) > 0:
print('{}: {}'.format(filename, corrections))
for correction in corrections:
content = clean.replace_pair(correction, content)
else:
pass
with open(join(directories['cycle'], filename), mode="w") as o:
o.write(content)
o.close()
WMH19030513-V01-19-page1.txt: [("'Alarmed", 'Alarmed')] WMH19030520-V01-20-page4.txt: [("'born", 'born')] WMH19030527-V01-21-page4.txt: [("'An", 'An')] WMH19030610-V01-23-page1.txt: [("'Evangelical", 'Evangelical')] WMH19030610-V01-23-page2.txt: [("'promotes", 'promotes')] WMH19030610-V01-23-page4.txt: [("'Rem", 'Rem'), ("'Rotes", 'Rotes')] WMH19030624-V01-25-page4.txt: [("'dews", 'dews'), ("'notes", 'notes')] WMH19030701-V01-26-page4.txt: [("'news", 'news'), ("'Pews", 'Pews'), ("'Rotes", 'Rotes')] WMH19030708-V01-27-page2.txt: [("'Department", 'Department')] WMH19030708-V01-27-page4.txt: [("'school", 'school'), ("'Mews", 'Mews'), ("'Rotes", 'Rotes')] WMH19030715-V01-28-page1.txt: [("'Tis", 'Tis')] WMH19030715-V01-28-page3.txt: [("'.Died", '.Died')] WMH19030715-V01-28-page4.txt: [("'Flews", 'Flews'), ("'Hews", 'Hews')] WMH19030722-V01-29-page2.txt: [("'and", 'and')] WMH19030722-V01-29-page4.txt: [("'Flews", 'Flews'), ("'Motes", 'Motes'), ("'Pews", 'Pews')] WMH19030930-V01-39-page3.txt: [("'pecan", 'pecan')] WMH19030930-V01-39-page4.txt: [("'news", 'news'), ("'Notes", 'Notes')] WMH19031028-V01-43-page1.txt: [("'self", 'self')] WMH19031028-V01-43-page3.txt: [("'all", 'all')] WMH19031028-V01-43-page4.txt: [("'Hews", 'Hews'), ("'by", 'by'), ("'we", 'we'), ("'Hews", 'Hews')] WMH19031118-V01-46-page4.txt: [("'Flews", 'Flews'), ("'notes", 'notes'), ("'each", 'each')] WMH19040106-V02-02-page1.txt: [("'the", 'the')] WMH19040113-V02-03-page3.txt: [("'We", 'We')] WMH19040113-V02-03-page4.txt: [("'Battle", 'Battle')] WMH19040210-V02-06-page2.txt: [("'treatments", 'treatments'), ("'summer", 'summer'), ("'all", 'all')] WMH19040210-V02-06-page3.txt: [("'Creek", 'Creek')] WMH19040210-V02-06-page4.txt: [("'dress", 'dress')] WMH19040217-V02-07-page1.txt: [("'traverse", 'traverse'), ("'Joseph", 'Joseph'), ("'loved", 'loved'), ("'book", 'book'), ("'the", 'the')] WMH19040224-V02-08-page2.txt: [("'nurses", 'nurses')] WMH19040302-V02-09-page2.txt: [("'present", 'present')] WMH19040302-V02-09-page3.txt: [("'stairs", 'stairs')] WMH19040309-V02-10-page2.txt: [("'Another", 'Another')] WMH19040309-V02-10-page3.txt: [("'persons", 'persons')] WMH19040309-V02-10-page4.txt: [("'I", 'I'), ("'My", 'My')] WMH19040316-V02-11-page1.txt: [("'Michigan", 'Michigan')] WMH19040316-V02-11-page2.txt: [("'the", 'the'), ("'More", 'More')] WMH19040316-V02-11-page3.txt: [("'great", 'great')] WMH19040323-V02-12-page1.txt: [("'we", 'we')] WMH19040323-V02-12-page2.txt: [("'book", 'book')] WMH19040323-V02-12-page3.txt: [("'to", 'to')] WMH19040330-V02-13-page3.txt: [("'unable", 'unable')] WMH19040406-V02-14-page2.txt: [("'the", 'the'), ("'in", 'in'), ("'eight", 'eight')] WMH19040406-V02-14-page3.txt: [("'lungs", 'lungs')] WMH19040420-V02-16-page1.txt: [("'made", 'made'), ("'Not", 'Not')] WMH19040420-V02-16-page2.txt: [("'been", 'been'), ("'With", 'With')] WMH19040420-V02-16-page4.txt: [("'Conference.", 'Conference.')] WMH19040427-V02-17-page1.txt: [("'Michigan", 'Michigan'), ("'branch", 'branch')] WMH19040427-V02-17-page2.txt: [("'occassionally.", 'occassionally.'), ("'of", 'of')] WMH19040427-V02-17-page3.txt: [("'Thoburn", 'Thoburn')] WMH19040504-V02-18-page2.txt: [("'it", 'it'), ("'evil", 'evil')] WMH19040504-V02-18-page3.txt: [("'A", 'A'), ("'meal", 'meal')] WMH19040511-V02-19-page1.txt: [("'I", 'I'), ("'that", 'that'), ("'tanner.", 'tanner.')] WMH19040511-V02-19-page2.txt: [("'up", 'up')] WMH19040511-V02-19-page3.txt: [("'Freas.", 'Freas.'), ("'Financial.", 'Financial.')] WMH19040518-V02-20-page1.txt: [("'Was", 'Was'), ("'warm", 'warm'), ("'this", 'this'), ("'others", 'others'), ("'influence", 'influence')] WMH19040518-V02-20-page2.txt: [("'go", 'go')] WMH19040601-V02-22-page2.txt: [("'that", 'that')] WMH19040601-V02-22-page4.txt: [("'Kenyon", 'Kenyon')] WMH19040608-V02-23-page1.txt: [("'Michigan", 'Michigan'), ("'large", 'large'), ("'and", 'and')] WMH19040608-V02-23-page3.txt: [("'called", 'called'), ("'to", 'to'), ("'the", 'the')] WMH19040608-V02-23-page4.txt: [("'was", 'was')] WMH19040622-V02-24-page1.txt: [("'pay", 'pay')] WMH19040622-V02-24-page4.txt: [("'and", 'and')] WMH19040629-V02-25-page1.txt: [("'the", 'the')] WMH19040629-V02-25-page4.txt: [("'Conference", 'Conference')] WMH19040706-V02-26-page4.txt: [("'in", 'in'), ("'HERALD", 'HERALD')] WMH19040713-V02-27-page2.txt: [("'he", 'he')] WMH19040713-V02-27-page3.txt: [("'session", 'session'), ("'to", 'to')] WMH19040720-V02-28-page1.txt: [("'Reading", 'Reading')] WMH19040720-V02-28-page2.txt: [("'when", 'when')] WMH19040720-V02-28-page3.txt: [("'of", 'of'), ("'teacher", 'teacher'), ("'and", 'and')] WMH19040720-V02-28-page4.txt: [("'subscriptions", 'subscriptions'), ("'I'.", 'I.')] WMH19040727-V02-29-page2.txt: [("'favor", 'favor')] WMH19040810-V02-31-page1.txt: [("'Much", 'Much')] WMH19040810-V02-31-page2.txt: [("'s", 's')] WMH19040810-V02-31-page3.txt: [("'disease.", 'disease.'), ("'state", 'state')] WMH19040817-V02-32-page2.txt: [("'and", 'and'), ("'reaching", 'reaching')] WMH19040817-V02-32-page4.txt: [("'We", 'We')] WMH19040831-V02-33-page1.txt: [("'the", 'the')] WMH19040831-V02-33-page2.txt: [("'as", 'as')] WMH19040831-V02-33-page3.txt: [("'Breads", 'Breads')] WMH19040914-V02-34-page1.txt: [("'Michigan", 'Michigan'), ("'poverty.", 'poverty.')] WMH19040914-V02-34-page2.txt: [("'great", 'great')] WMH19040914-V02-34-page3.txt: [("'they", 'they'), ("'a", 'a')] WMH19040914-V02-34-page4.txt: [("'opened", 'opened'), ("'school", 'school')] WMH19040928-V02-35-page3.txt: [("'twelve", 'twelve')] WMH19041005-V02-36-page2.txt: [("'c.", 'c.')] WMH19041005-V02-36-page3.txt: [("'I'REAMENT.", 'IREAMENT.'), ("'a", 'a')] WMH19041005-V02-36-page4.txt: [("'September.", 'September.')] WMH19041012-V02-37-page1.txt: [("'aostile", 'aostile'), ("'brief", 'brief')] WMH19041012-V02-37-page2.txt: [("'of", 'of')] WMH19041019-V02-38-page3.txt: [("'devoted", 'devoted'), ("'These", 'These'), ("'to", 'to'), ("'for", 'for')] WMH19041026-V02-39-page1.txt: [("'new", 'new')] WMH19041026-V02-39-page3.txt: [("'that", 'that'), ("'part", 'part')] WMH19041026-V02-39-page4.txt: [("'Missionary", 'Missionary'), ("'education", 'education')] WMH19041102-V02-40-page1.txt: [("'magnitude", 'magnitude')] WMH19041102-V02-40-page2.txt: [("'work.", 'work.')] WMH19041102-V02-40-page3.txt: [("'when", 'when')] WMH19041102-V02-40-page4.txt: [("'doing", 'doing'), ("'The", 'The')] WMH19041109-V02-41-page2.txt: [("'Sabbath-keepers", 'Sabbath-keepers')] WMH19041109-V02-41-page4.txt: [("'rule", 'rule')] WMH19041116-V02-42-page3.txt: [("'the", 'the')] WMH19041123-V02-43-page1.txt: [("'of", 'of'), ("'prayer", 'prayer')] WMH19041123-V02-43-page3.txt: [("'The", 'The')] WMH19041123-V02-43-page4.txt: [("'the", 'the')] WMH19041130-V02-44-page1.txt: [("'to", 'to')] WMH19041130-V02-44-page2.txt: [("'a", 'a'), ("'association", 'association')] WMH19041130-V02-44-page3.txt: [("'scold", 'scold')] WMH19041207-V02-45-page2.txt: [("'us", 'us'), ("'RECORD.", 'RECORD.')] WMH19041207-V02-45-page3.txt: [("'services", 'services')] WMH19041214-V02-46-page3.txt: [("'Food", 'Food'), ("'remove", 'remove'), ("'be", 'be')] WMH19041214-V02-46-page4.txt: [("'Jesus", 'Jesus')] WMH19041221-V02-47-page2.txt: [("'When", 'When'), ("'I", 'I'), ("'Each", 'Each')] WMH19041221-V02-47-page4.txt: [("'message", 'message'), ("'fourth", 'fourth'), ("'up", 'up')] WMH19041228-V02-48-page1.txt: [("'Wealthy", 'Wealthy')] WMH19041228-V02-48-page3.txt: [("'you", 'you')] WMH19050104-V03-01-page2.txt: [("'the", 'the'), ("'NOW.", 'NOW.')] WMH19050104-V03-01-page4.txt: [("'A", 'A')] WMH19050118-V03-03-page1.txt: [("'matter", 'matter')] WMH19050118-V03-03-page2.txt: [("'courage", 'courage'), ("'Christ's", 'Christs'), ("'The", 'The'), ("'Object", 'Object'), ("'Object", 'Object'), ("'Object", 'Object')] WMH19050118-V03-03-page3.txt: [("'Object", 'Object'), ("'Christ's", 'Christs'), ("'Object", 'Object'), ("'Object", 'Object'), ("'Object", 'Object')] WMH19050208-V03-05-page1.txt: [("'kind", 'kind')] WMH19050208-V03-05-page2.txt: [("'seventeen", 'seventeen')] WMH19050215-V03-06-page3.txt: [("'Hesperia", 'Hesperia'), ("'A", 'A')] WMH19050215-V03-06-page4.txt: [("'William", 'William')] WMH19050222-V03-07-page1.txt: [("'The", 'The')] WMH19050222-V03-07-page3.txt: [("'It", 'It'), ("'work.", 'work.')] WMH19050301-V03-08-page1.txt: [("'That", 'That')] WMH19050301-V03-08-page2.txt: [("'W.", 'W.')] WMH19050315-V03-10-page2.txt: [("'F", 'F'), ("'Southern", 'Southern')] WMH19050315-V03-10-page4.txt: [("'to", 'to')] WMH19050322-V03-11-page1.txt: [("'and", 'and')] WMH19050322-V03-11-page5.txt: [("'WI", 'WI')] WMH19050329-V03-12-page3.txt: [("'feature", 'feature'), ("'field.", 'field.')] WMH19050405-V03-13-page1.txt: [("'have", 'have')] WMH19050405-V03-13-page2.txt: [("'to", 'to'), ("'intensely", 'intensely')] WMH19050413-V03-14-page1.txt: [("'Sept.", 'Sept.')] WMH19050413-V03-14-page2.txt: [("'officers", 'officers')] WMH19050413-V03-14-page3.txt: [("'The", 'The')] WMH19050419-V03-15-page2.txt: [("'to", 'to')] WMH19050419-V03-15-page3.txt: [("'on", 'on')] WMH19050419-V03-15-page4.txt: [("'Cedar", 'Cedar')] WMH19050426-V03-16-page1.txt: [("'increase", 'increase'), ("'effect", 'effect'), ("'that", 'that')] WMH19050426-V03-16-page4.txt: [("'Tis", 'Tis'), ("'patients.", 'patients.')] WMH19050503-V03-17-page1.txt: [("'Fhat", 'Fhat'), ("'of", 'of')] WMH19050503-V03-17-page2.txt: [("'educational", 'educational'), ("'In", 'In')] WMH19050510-V03-18-page3.txt: [("'is", 'is'), ("'task", 'task')] WMH19050510-V03-18-page4.txt: [("'having", 'having'), ("'Come", 'Come'), ("'Here", 'Here')] WMH19050517-V03-19-page1.txt: [("'work", 'work'), ("'weightiest", 'weightiest')] WMH19050517-V03-19-page4.txt: [("'Conference", 'Conference')] WMH19050524-V03-20-page1.txt: [("'and", 'and')] WMH19050524-V03-20-page2.txt: [("'Christ's", 'Christs')] WMH19050524-V03-20-page3.txt: [("'Field.", 'Field.')] WMH19050531-V03-21-page1.txt: [("'leprosy", 'leprosy')] WMH19050531-V03-21-page2.txt: [("'but", 'but'), ("'Field.", 'Field.'), ("'Now", 'Now')] WMH19050531-V03-21-page4.txt: [("'Twice", 'Twice'), ("'already", 'already')] WMH19050607-V03-22-page1.txt: [("'and", 'and')] WMH19050607-V03-22-page3.txt: [("'EDITH", 'EDITH')] WMH19050607-V03-22-page4.txt: [("'.", '.')] WMH19050614-V03-23-page1.txt: [("'Surely", 'Surely')] WMH19050614-V03-23-page4.txt: [("'He", 'He')] WMH19050621-V03-24-page1.txt: [("'class", 'class')] WMH19050621-V03-24-page2.txt: [("'Bring", 'Bring')] WMH19050621-V03-24-page3.txt: [("'the", 'the'), ("'Sing", 'Sing'), ("'The", 'The'), ("'They", 'They'), ("'As", 'As'), ("'For", 'For')] WMH19050621-V03-24-page4.txt: [("'body", 'body')] WMH19050628-V03-25-page1.txt: [("'and", 'and'), ("'RESOLVED", 'RESOLVED')] WMH19050628-V03-25-page2.txt: [("'Desire", 'Desire'), ("'Desire", 'Desire'), ("'prominent.", 'prominent.')] WMH19050705-V03-26-page1.txt: [("'twer", 'twer')] WMH19050705-V03-26-page2.txt: [("'are", 'are')] WMH19050705-V03-26-page3.txt: [("'Field.", 'Field.'), ("'Hours", 'Hours'), ("'greater", 'greater')] WMH19050705-V03-26-page4.txt: [("'Be", 'Be')] WMH19050712-V03-27-page2.txt: [("'should", 'should')] WMH19050712-V03-27-page3.txt: [("'financial.", 'financial.')] WMH19050719-V03-28-page3.txt: [("'Field.", 'Field.')] WMH19050719-V03-28-page4.txt: [("'The", 'The')] WMH19050726-V03-29-page1.txt: [("'August", 'August')] WMH19050726-V03-29-page2.txt: [("'Field.", 'Field.'), ("'The", 'The')] WMH19050802-V03-30-page1.txt: [("'Hume.", 'Hume.')] WMH19050802-V03-30-page2.txt: [("'Practical", 'Practical')] WMH19050802-V03-30-page3.txt: [("'school", 'school'), ("'Field.", 'Field.')] WMH19050802-V03-30-page4.txt: [("'God", 'God')] WMH19050809-V03-31-page2.txt: [("'for", 'for'), ("'Field.", 'Field.'), ("'upon", 'upon'), ("'Glenwood", 'Glenwood')] WMH19050809-V03-31-page3.txt: [("'to", 'to')] WMH19050816-V03-32-page1.txt: [("'in", 'in')] WMH19050816-V03-32-page3.txt: [("'Jews", 'Jews')] WMH19050816-V03-32-page4.txt: [("'of", 'of'), ("'Sanctify", 'Sanctify')] WMH19050830-V03-33-page1.txt: [("'largest", 'largest'), ("'the", 'the')] WMH19050830-V03-33-page4.txt: [("'Follow", 'Follow')] WMH19050906-V03-34-page1.txt: [("'schools", 'schools')] WMH19050906-V03-34-page3.txt: [("'Financial.", 'Financial.')] WMH19050906-V03-34-page4.txt: [("'young", 'young'), ("'tis", 'tis'), ("'tis", 'tis')] WMH19050913-V03-35-page2.txt: [("'lead", 'lead')] WMH19050913-V03-35-page4.txt: [("'last", 'last')] WMH19050920-V03-36-page1.txt: [("'field.", 'field.')] WMH19050920-V03-36-page2.txt: [("'quiet", 'quiet'), ("'to", 'to'), ("'to", 'to')] WMH19050927-V03-37-page1.txt: [("'at", 'at'), ("'those", 'those')] WMH19050927-V03-37-page2.txt: [("'were", 'were')] WMH19050927-V03-37-page3.txt: [("'come", 'come'), ("'lifting", 'lifting')] WMH19051004-V03-38-page3.txt: [("'tis", 'tis')] WMH19051004-V03-38-page4.txt: [("'Edward", 'Edward')] WMH19051011-V03-39-page1.txt: [("'the", 'the'), ("'the", 'the')] WMH19051011-V03-39-page3.txt: [("'done", 'done')] WMH19051011-V03-39-page4.txt: [("'This", 'This')] WMH19051018-V03-40-page1.txt: [("'went", 'went')] WMH19051018-V03-40-page2.txt: [("'Field.", 'Field.')] WMH19051018-V03-40-page3.txt: [("'great", 'great')] WMH19051018-V03-40-page4.txt: [("'one", 'one'), ("'WEST", 'WEST')] WMH19051025-V03-41-page1.txt: [("'him", 'him')] WMH19051025-V03-41-page2.txt: [("'of", 'of')] WMH19051025-V03-41-page3.txt: [("'twill", 'twill'), ("'Field.", 'Field.')] WMH19051101-V03-42-page2.txt: [("'field.", 'field.')] WMH19051101-V03-42-page3.txt: [("'ferers", 'ferers')] WMH19051108-V03-43-page1.txt: [("'Field.", 'Field.')] WMH19051108-V03-43-page3.txt: [("'be", 'be')] WMH19051115-V03-44-page4.txt: [("'school", 'school')] WMH19051122-V03-45-page1.txt: [("'Seeking", 'Seeking')] WMH19051122-V03-45-page3.txt: [("'Week", 'Week')] WMH19051129-V03-46-page2.txt: [("'at", 'at'), ("'Enter", 'Enter'), ("'Field.", 'Field.')] WMH19051129-V03-46-page3.txt: [("'Word.", 'Word.')] WMH19051206-V03-47-page2.txt: [("'are", 'are')] WMH19051213-V03-48-page1.txt: [("'Tis", 'Tis'), ("'Tis", 'Tis')] WMH19051213-V03-48-page2.txt: [("'Field.", 'Field.')] WMH19051213-V03-48-page4.txt: [("'voiced", 'voiced'), ("'three", 'three'), ("'for", 'for')] WMH19051220-V03-49-page1.txt: [("'tis", 'tis'), ("'All", 'All')] WMH19051220-V03-49-page2.txt: [("'.", '.'), ("'.", '.')] WMH19051220-V03-49-page3.txt: [("'Field.", 'Field.')] WMH19051227-V03-50-page1.txt: [("'study", 'study')] WMH19051227-V03-50-page3.txt: [("'margin.", 'margin.')] WMH19051227-V03-50-page4.txt: [('\'"', '"')] WMH19060103-V04-01-page1.txt: [("'field.", 'field.')] WMH19060103-V04-01-page4.txt: [("'Christ", 'Christ')] WMH19060110-V04-02-page2.txt: [("'for", 'for'), ('\'strength."', 'strength."'), ("'Financial.", 'Financial.')] WMH19060110-V04-02-page3.txt: [("'that", 'that'), ("'that", 'that')] WMH19060110-V04-02-page4.txt: [("'paper", 'paper')] WMH19060117-V04-03-page4.txt: [("'o", 'o'), ("'o", 'o')] WMH19060124-V04-04-page1.txt: [("'I", 'I'), ("'ministers", 'ministers')] WMH19060124-V04-04-page2.txt: [("'for", 'for'), ("'in", 'in')] WMH19060131-V04-05-page2.txt: [("'Statement", 'Statement'), ("'o", 'o'), ("'o", 'o'), ("'o", 'o')] WMH19060131-V04-05-page3.txt: [("'late", 'late'), ("'field.", 'field.')] WMH19060131-V04-05-page4.txt: [("'E.", 'E.')] WMH19060207-V04-06-page3.txt: [("'Field.", 'Field.')] WMH19060214-V04-07-page1.txt: [("'for", 'for')] WMH19060214-V04-07-page2.txt: [("'should", 'should')] WMH19060214-V04-07-page3.txt: [("'that", 'that')] WMH19060221-V04-08-page1.txt: [("'church", 'church')] WMH19060221-V04-08-page2.txt: [("'.ed", '.ed')] WMH19060221-V04-08-page3.txt: [("'operation", 'operation')] WMH19060221-V04-08-page4.txt: [("'or", 'or')] WMH19060228-V04-09-page3.txt: [("'Field.", 'Field.'), ("'Turn", 'Turn')] WMH19060228-V04-09-page4.txt: [("'U.", 'U.')] WMH19060307-V04-10-page2.txt: [("'a", 'a'), ("'Field.", 'Field.')] WMH19060307-V04-10-page3.txt: [("'are", 'are'), ("'we", 'we')] WMH19060314-V04-11-page2.txt: [("'C.", 'C.')] WMH19060314-V04-11-page3.txt: [("'Hosanna", 'Hosanna')] WMH19060321-V04-12-page1.txt: [("'worker", 'worker')] WMH19060321-V04-12-page2.txt: [("'Field.", 'Field.'), ("'whom", 'whom')] WMH19060321-V04-12-page3.txt: [("'the", 'the')] WMH19060328-V04-13-page2.txt: [("'Field.", 'Field.')] WMH19060411-V04-15-page1.txt: [("'Union", 'Union')] WMH19060411-V04-15-page2.txt: [("'Financial.", 'Financial.'), ("'Field.", 'Field.')] WMH19060411-V04-15-page3.txt: [("'so", 'so')] WMH19060411-V04-15-page4.txt: [("'amount", 'amount'), ("'Dietetics", 'Dietetics')] WMH19060425-V04-17-page1.txt: [("'for", 'for')] WMH19060425-V04-17-page3.txt: [("'of", 'of'), ("'crowned", 'crowned')] WMH19060425-V04-17-page4.txt: [("'straw", 'straw')] WMH19060502-V04-18-page3.txt: [("'goo.", 'goo.'), ("'a", 'a'), ("'will", 'will'), ("'small", 'small')] WMH19060502-V04-18-page4.txt: [("'.Wells", '.Wells')] WMH19060509-V04-19-page1.txt: [("'i", 'i')] WMH19060509-V04-19-page3.txt: [("'Field.", 'Field.'), ("'to", 'to')] WMH19060509-V04-19-page4.txt: [("'the", 'the')] WMH19060523-V04-20-page3.txt: [("'Field.", 'Field.')] WMH19060530-V04-21-page2.txt: [("'this", 'this')] WMH19060530-V04-21-page3.txt: [("'greater", 'greater'), ("'Field.", 'Field.')] WMH19060613-V04-23-page3.txt: [("'To", 'To')] WMH19060613-V04-23-page4.txt: [("'new", 'new')] WMH19060620-V04-24-page1.txt: [("'been", 'been')] WMH19060620-V04-24-page3.txt: [("'them", 'them')] WMH19060627-V04-25-page2.txt: [("'Field.", 'Field.')] WMH19060627-V04-25-page4.txt: [("'up", 'up')] WMH19060704-V04-26-page1.txt: [("'Bible", 'Bible'), ("'SEGO", 'SEGO')] WMH19060704-V04-26-page3.txt: [("'Tis", 'Tis')] WMH19060704-V04-26-page4.txt: [("'the", 'the'), ("'at", 'at'), ("'in", 'in')] WMH19060711-V04-27-page1.txt: [("'tis", 'tis')] WMH19060711-V04-27-page3.txt: [("'lath", 'lath'), ('\'Times"', 'Times"')] WMH19060711-V04-27-page4.txt: [("'Field.", 'Field.')] WMH19060718-V04-28-page1.txt: [("'ilm", 'ilm')] WMH19060725-V04-29-page1.txt: [("'Field.", 'Field.'), ("'tiff", 'tiff'), ('\'"', '"')] WMH19060725-V04-29-page3.txt: [("'prayer", 'prayer'), ("'and", 'and')] WMH19060725-V04-29-page4.txt: [("'EZRA", 'EZRA')] WMH19060801-V04-30-page1.txt: [("'great", 'great'), ("'We", 'We')] WMH19060801-V04-30-page2.txt: [("'loose", 'loose'), ("'Ye", 'Ye'), ("'us", 'us'), ("'victory", 'victory')] WMH19060801-V04-30-page3.txt: [("'Field.", 'Field.')] WMH19060801-V04-30-page4.txt: [("'When", 'When'), ("'And", 'And')] WMH19060808-V04-31-page3.txt: [("'Cedar", 'Cedar'), ("'Field.", 'Field.')] WMH19060822-V04-32-page2.txt: [("'perfect", 'perfect'), ("'text", 'text'), ("'may", 'may'), ("'be", 'be'), ("'SUNDAY", 'SUNDAY')] WMH19060822-V04-32-page3.txt: [("'Good", 'Good'), ("'that", 'that'), ("'that", 'that'), ("'.", '.')] WMH19060829-V04-33-page4.txt: [("'twas", 'twas'), ("'.", '.'), ("'Twill", 'Twill'), ("'keep", 'keep'), ("'Tis", 'Tis'), ("'tis", 'tis'), ("'keep", 'keep')] WMH19060905-V04-34-page1.txt: [("'Christ.", 'Christ.'), ("'made", 'made')] WMH19060905-V04-34-page2.txt: [("'work", 'work')] WMH19060905-V04-34-page4.txt: [("'Without", 'Without')] WMH19060912-V04-35-page1.txt: [("'T", 'T')] WMH19060912-V04-35-page2.txt: [("'train", 'train')] WMH19060912-V04-35-page3.txt: [("'before", 'before')] WMH19060912-V04-35-page4.txt: [("'Field.", 'Field.')] WMH19060919-V04-36-page1.txt: [("'Till", 'Till'), ("'Twill", 'Twill'), ("'Twill", 'Twill'), ("'people.", 'people.')] WMH19060919-V04-36-page4.txt: [("'be", 'be')] WMH19060926-V04-37-page1.txt: [("'of", 'of')] WMH19060926-V04-37-page2.txt: [("'Why", 'Why'), ("'I", 'I')] WMH19060926-V04-37-page3.txt: [("'them", 'them')] WMH19061003-V04-38-page2.txt: [("'they", 'they')] WMH19061003-V04-38-page3.txt: [("'being", 'being'), ("'variety", 'variety'), ("'cloth", 'cloth')] WMH19061003-V04-38-page4.txt: [("'Bertha", 'Bertha')] WMH19061010-V04-39-page2.txt: [("'s", 's'), ("'is", 'is')] WMH19061010-V04-39-page3.txt: [("'Field.", 'Field.')] WMH19061010-V04-39-page4.txt: [("'reports", 'reports'), ("'twixt", 'twixt'), ("'tis", 'tis'), ("'twixt", 'twixt'), ("'and", 'and')] WMH19061017-V04-40-page1.txt: [("'the", 'the'), ("'.", '.'), ("'field.", 'field.')] WMH19061017-V04-40-page3.txt: [("'followed", 'followed')] WMH19061024-V04-41-page2.txt: [("'the", 'the')] WMH19061024-V04-41-page3.txt: [("'field.", 'field.'), ("'look", 'look')] WMH19061031-V04-42-page1.txt: [("'W.", 'W.')] WMH19061031-V04-42-page2.txt: [("'disposition", 'disposition'), ("'We", 'We')] WMH19061031-V04-42-page3.txt: [("'Field.", 'Field.')] WMH19061031-V04-42-page4.txt: [("'But", 'But')] WMH19061107-V04-43-page1.txt: [("'well", 'well')] WMH19061107-V04-43-page2.txt: [("'root", 'root'), ("'Christ's", 'Christs')] WMH19061107-V04-43-page3.txt: [("'Field.", 'Field.')] WMH19061107-V04-43-page4.txt: [("'of", 'of')] WMH19061114-V04-44-page1.txt: [("'stumps", 'stumps'), ("'illl", 'illl')] WMH19061114-V04-44-page2.txt: [("'W.", 'W.')] WMH19061114-V04-44-page3.txt: [("'Field.", 'Field.'), ("'for", 'for')] WMH19061114-V04-44-page4.txt: [("'Young", 'Young')] WMH19061121-V04-45-page1.txt: [("'Ole", 'Ole'), ("'and", 'and')] WMH19061121-V04-45-page2.txt: [("'first", 'first')] WMH19061121-V04-45-page3.txt: [("'and", 'and')] WMH19061121-V04-45-page4.txt: [("'now", 'now')] WMH19061128-V04-46-page1.txt: [("'Nur", 'Nur'), ("'SEGO", 'SEGO')] WMH19061128-V04-46-page3.txt: [("'has", 'has')] WMH19061205-V04-47-page2.txt: [("'desire", 'desire'), ("'Your", 'Your')] WMH19061205-V04-47-page3.txt: [("'West", 'West'), ("'Field.", 'Field.')] WMH19061212-V04-48-page2.txt: [("'field.", 'field.')] WMH19061219-V04-49-page1.txt: [("'....", '....'), ("'necessities", 'necessities')] WMH19061226-V04-50-page2.txt: [("'Field.", 'Field.')] WMH19070102-V05-01-page3.txt: [("'are", 'are')] WMH19070102-V05-01-page4.txt: [("'power.", 'power.')] WMH19070109-V05-02-page1.txt: [("'but", 'but')] WMH19070116-V05-03-page3.txt: [("'God", 'God')] WMH19070116-V05-03-page4.txt: [("'of", 'of'), ("'field.", 'field.')] WMH19070123-V05-04-page2.txt: [("'rrufant", 'rrufant'), ("'never", 'never'), ("'any", 'any')] WMH19070130-V05-05-page1.txt: [("'as", 'as')] WMH19070130-V05-05-page2.txt: [("'educational", 'educational')] WMH19070130-V05-05-page3.txt: [("'us", 'us'), ("'that", 'that')] WMH19070206-V05-06-page1.txt: [('\'"', '"'), ("'last", 'last')] WMH19070206-V05-06-page2.txt: [("'Financial.", 'Financial.')] WMH19070206-V05-06-page4.txt: [("'as", 'as')] WMH19070213-V05-07-page2.txt: [("'dollars", 'dollars')] WMH19070220-V05-08-page1.txt: [("'orders", 'orders')] WMH19070220-V05-08-page3.txt: [("'for", 'for')] WMH19070227-V05-09-page1.txt: [("'rent", 'rent'), ("'o", 'o')] WMH19070227-V05-09-page2.txt: [("'o", 'o')] WMH19070306-V05-10-page1.txt: [("'Woe", 'Woe')] WMH19070320-V05-12-page2.txt: [("'In", 'In')] WMH19070320-V05-12-page3.txt: [("'Financial.", 'Financial.'), ("'Battle", 'Battle')] WMH19070327-V05-13-page1.txt: [("'what", 'what')] WMH19070327-V05-13-page2.txt: [("'endorsed", 'endorsed'), ("'cOi", 'cOi'), ("'Field.", 'Field.'), ("'Come", 'Come')] WMH19070327-V05-13-page3.txt: [("'March", 'March'), ("'consideration", 'consideration')] WMH19070327-V05-13-page4.txt: [("'Freemont", 'Freemont')] WMH19070417-V05-16-page2.txt: [("'the", 'the'), ("'that", 'that')] WMH19070417-V05-16-page3.txt: [("'Field.", 'Field.')] WMH19070424-V05-17-page1.txt: [("'enemy", 'enemy')] WMH19070424-V05-17-page2.txt: [("'foi", 'foi')] WMH19070424-V05-17-page3.txt: [("'or", 'or'), ("'field.", 'field.')] WMH19070424-V05-17-page4.txt: [("'for", 'for')] WMH19070501-V05-18-page1.txt: [("'Wm", 'Wm')] WMH19070501-V05-18-page2.txt: [("'knew", 'knew')] WMH19070501-V05-18-page3.txt: [("'say", 'say'), ("'the", 'the'), ("'field.", 'field.')] WMH19070501-V05-18-page4.txt: [("'Twould", 'Twould'), ("'read", 'read'), ("'I", 'I')] WMH19070508-V05-19-page1.txt: [("'the", 'the')] WMH19070515-V05-20-page1.txt: [("'vat", 'vat'), ("'Twill", 'Twill'), ("'twill", 'twill')] WMH19070515-V05-20-page2.txt: [("'God", 'God'), ("'Third", 'Third')] WMH19070522-V05-21-page3.txt: [("'Field.", 'Field.')] WMH19070529-V05-22-page1.txt: [("'Tis", 'Tis')] WMH19070529-V05-22-page2.txt: [("'Field.", 'Field.')] WMH19070529-V05-22-page4.txt: [("'Jet", 'Jet')] WMH19070605-V05-23-page1.txt: [("'y", 'y')] WMH19070605-V05-23-page4.txt: [("'field.", 'field.')] WMH19070612-V05-24-page2.txt: [("'the", 'the'), ("'field.", 'field.')] WMH19070619-V05-25-page1.txt: [("'handle", 'handle'), ("'E.", 'E.')] WMH19070619-V05-25-page3.txt: [("'Field.", 'Field.')] WMH19070626-V05-26-page3.txt: [("'Field.", 'Field.'), ("'prevails.", 'prevails.')] WMH19070703-V05-27-page4.txt: [("'depot", 'depot')] WMH19070717-V05-29-page4.txt: [("'after", 'after'), ("'the", 'the'), ("'opened", 'opened')] WMH19070724-V05-30-page1.txt: [("'m", 'm')] WMH19070724-V05-30-page3.txt: [("'vas", 'vas'), ("'appear", 'appear'), ("'inspiring", 'inspiring')] WMH19070731-V05-31-page1.txt: [("'Y", 'Y'), ("'children", 'children')] WMH19070731-V05-31-page3.txt: [("'Ontario", 'Ontario')] WMH19070807-V05-32-page2.txt: [("'the", 'the')] WMH19070807-V05-32-page3.txt: [("'a", 'a')] WMH19070807-V05-32-page4.txt: [("'cello", 'cello'), ("'Field.", 'Field.')] WMH19070814-V05-33-page3.txt: [("'friend", 'friend')] WMH19070828-V05-34-page2.txt: [("'men", 'men')] WMH19070828-V05-34-page3.txt: [("'Field.", 'Field.'), ("'A", 'A'), ("'appreciate", 'appreciate')] WMH19070904-V05-35-page1.txt: [("'how", 'how'), ("'employ", 'employ'), ("'our", 'our'), ("'a", 'a')] WMH19070904-V05-35-page2.txt: [("'the", 'the'), ("'the", 'the'), ("'Do", 'Do')] WMH19070911-V05-36-page1.txt: [("'Rrimr", 'Rrimr'), ("'classes", 'classes')] WMH19070911-V05-36-page2.txt: [("'any", 'any')] WMH19070911-V05-36-page4.txt: [("'the", 'the'), ("'Financial.", 'Financial.'), ("'Iowa", 'Iowa')] WMH19070918-V05-37-page4.txt: [("'would", 'would'), ("'except", 'except')] WMH19070925-V05-38-page2.txt: [("'dollars", 'dollars')] WMH19071002-V05-39-page2.txt: [("'s", 's')] WMH19071002-V05-39-page4.txt: [("'twixt", 'twixt'), ("'tis", 'tis'), ("'twixt", 'twixt')] WMH19071009-V05-40-page2.txt: [("'Through", 'Through'), ("'emit", 'emit')] WMH19071016-V05-41-page1.txt: [("'.", '.')] WMH19071016-V05-41-page3.txt: [("'faith", 'faith')] WMH19071023-V05-42-page1.txt: [("'''o", 'o')] WMH19071030-V05-43-page1.txt: [("'to", 'to')] WMH19071030-V05-43-page2.txt: [("'Reading", 'Reading')] WMH19071030-V05-43-page3.txt: [("'until", 'until')] WMH19071106-V05-44-page1.txt: [("'-", '-')] WMH19071106-V05-44-page3.txt: [("'beginning", 'beginning'), ("'created", 'created'), ("'form", 'form'), ("'void", 'void'), ("'firmament", 'firmament'), ("'Let", 'Let'), ("'fruit", 'fruit'), ("'signs", 'signs'), ("'seasons", 'seasons')] WMH19071113-V05-45-page2.txt: [("'There", 'There')] WMH19071113-V05-45-page3.txt: [("'Trunk's", 'Trunks')] WMH19071113-V05-45-page4.txt: [("'phones", 'phones'), ("'phone", 'phone')] WMH19071120-V05-46-page1.txt: [("'Kings", 'Kings')] WMH19071127-V05-47-page1.txt: [("'The", 'The'), ("'It", 'It')] WMH19071127-V05-47-page3.txt: [("'now", 'now'), ("'people", 'people'), ("'it", 'it')] WMH19071204-V05-48-page1.txt: [("'s", 's')] WMH19071211-V05-49-page1.txt: [("'plEit", 'plEit'), ("'air.", 'air.')] WMH19071211-V05-49-page2.txt: [("'especially", 'especially'), ("'made", 'made')] WMH19080101-V06-01-page1.txt: [("'.", '.'), ("'Lis", 'Lis'), ("'Tls", 'Tls')] WMH19080101-V06-01-page3.txt: [("'field", 'field')] WMH19080101-V06-01-page4.txt: [("'SIGNS", 'SIGNS'), ("'fifty", 'fifty'), ("'Our", 'Our')] WMH19080108-V06-02-page2.txt: [("'Field.", 'Field.')] WMH19080108-V06-02-page3.txt: [("'shall", 'shall'), ("'be", 'be'), ("'sick", 'sick')] WMH19080115-V06-03-page3.txt: [("'Field.", 'Field.'), ("'o", 'o'), ("'Michigan", 'Michigan')] WMH19080122-V06-04-page1.txt: [("'Field.", 'Field.'), ("'III", 'III')] WMH19080122-V06-04-page2.txt: [("'disease", 'disease')] WMH19080129-V06-05-page1.txt: [("'AdVq", 'AdVq'), ("'tis", 'tis'), ("'greater", 'greater')] WMH19080129-V06-05-page2.txt: [("'Field.", 'Field.'), ("'see", 'see')] WMH19080129-V06-05-page3.txt: [("'Christian", 'Christian')] WMH19080205-V06-06-page1.txt: [("'financial.", 'financial.'), ("'I", 'I')] WMH19080205-V06-06-page4.txt: [("'once", 'once'), ("'success", 'success')] WMH19080212-V06-07-page1.txt: [("'Arehk", 'Arehk'), ("'Atlanta", 'Atlanta'), ("'Iowa", 'Iowa')] WMH19080212-V06-07-page3.txt: [("'Vaunt", 'Vaunt')] WMH19080212-V06-07-page4.txt: [("'as", 'as'), ("'tis", 'tis')] WMH19080219-V06-08-page2.txt: [("'RUSSELL.", 'RUSSELL.')] WMH19080219-V06-08-page4.txt: [("'this", 'this')] WMH19080226-V06-09-page3.txt: [("'without", 'without')] WMH19080226-V06-09-page4.txt: [("'to", 'to')] WMH19080304-V06-10-page1.txt: [("'gave", 'gave')] WMH19080304-V06-10-page2.txt: [("'Field.", 'Field.')] WMH19080304-V06-10-page3.txt: [("'a", 'a')] WMH19080311-V06-11-page1.txt: [("'reflected", 'reflected'), ("'be", 'be')] WMH19080311-V06-11-page3.txt: [("'Field.", 'Field.')] WMH19080318-V06-12-page1.txt: [("'.", '.')] WMH19080318-V06-12-page2.txt: [("'and", 'and')] WMH19080325-V06-13-page2.txt: [("'spoke", 'spoke'), ("'our", 'our'), ("'minds", 'minds')] WMH19080325-V06-13-page3.txt: [("'Field.", 'Field.'), ("'earnestly", 'earnestly')] WMH19080325-V06-13-page4.txt: [("'papers", 'papers')] WMH19080401-V06-14-page1.txt: [("'in", 'in'), ("'I", 'I')] WMH19080408-V06-15-page1.txt: [("'the", 'the'), ("'President", 'President')] WMH19080408-V06-15-page3.txt: [("'something", 'something')] WMH19080415-V06-16-page1.txt: [("'with", 'with'), ("'E.", 'E.')] WMH19080415-V06-16-page3.txt: [("'the", 'the'), ("'ii", 'ii')] WMH19080422-V06-17-page1.txt: [("'a", 'a'), ("'at", 'at')] WMH19080422-V06-17-page2.txt: [("'the", 'the'), ("'Financial", 'Financial')] WMH19080422-V06-17-page3.txt: [("'Field.", 'Field.')] WMH19080429-V06-18-page1.txt: [("'a", 'a')] WMH19080429-V06-18-page2.txt: [("'race", 'race')] WMH19080429-V06-18-page3.txt: [("'Field.", 'Field.'), ("'to", 'to'), ("'tention", 'tention'), ("'cause", 'cause')] WMH19080506-V06-19-page1.txt: [("'III", 'III')] WMH19080506-V06-19-page2.txt: [("'more", 'more')] WMH19080506-V06-19-page3.txt: [("'the", 'the')] WMH19080513-V06-20-page3.txt: [("'be", 'be')] WMH19080520-V06-21-page1.txt: [("'A", 'A'), ("'in", 'in'), ("'s", 's')] WMH19080520-V06-21-page3.txt: [("'circle", 'circle'), ("'by", 'by')] WMH19080520-V06-21-page4.txt: [("'them", 'them')] WMH19080527-V06-22-page2.txt: [("'to", 'to'), ("'and", 'and'), ("'therefore", 'therefore')] WMH19080527-V06-22-page3.txt: [("'Field.", 'Field.'), ("'occupying", 'occupying'), ("'our", 'our'), ("'thus", 'thus')] WMH19080603-V06-23-page1.txt: [("'these", 'these'), ("'do", 'do')] WMH19080603-V06-23-page2.txt: [("'Field.", 'Field.'), ("'Essay", 'Essay'), ("'hand", 'hand')] WMH19080603-V06-23-page3.txt: [("'to", 'to')] WMH19080603-V06-23-page4.txt: [("'so", 'so'), ("'Walter", 'Walter')] WMH19080610-V06-24-page1.txt: [("'d", 'd')] WMH19080610-V06-24-page2.txt: [("'They", 'They')] WMH19080610-V06-24-page3.txt: [("'consecrating", 'consecrating'), ("'for", 'for')] WMH19080610-V06-24-page4.txt: [("'of", 'of')] WMH19080617-V06-25-page1.txt: [("'.i", '.i')] WMH19080617-V06-25-page2.txt: [("'We", 'We')] WMH19080617-V06-25-page3.txt: [("'in", 'in'), ("'Shelby", 'Shelby')] WMH19080617-V06-25-page4.txt: [("'been", 'been'), ("'Liberty", 'Liberty')] WMH19080624-V06-26-page1.txt: [("'much", 'much')] WMH19080624-V06-26-page2.txt: [("'to", 'to'), ("'o", 'o')] WMH19080624-V06-26-page3.txt: [("'G.", 'G.')] WMH19080624-V06-26-page4.txt: [("'Canadian", 'Canadian')] WMH19080701-V06-27-page2.txt: [("'case", 'case'), ("'I", 'I'), ("'on", 'on'), ("'hindrance", 'hindrance'), ("'tis", 'tis'), ("'and", 'and')] WMH19080701-V06-27-page3.txt: [("'Fake", 'Fake'), ("'Under", 'Under')] WMH19080708-V06-28-page1.txt: [("'vt", 'vt'), ("'God's", 'Gods'), ("'funds", 'funds'), ("'bath", 'bath')] WMH19080708-V06-28-page2.txt: [("'month", 'month')] WMH19080715-V06-29-page1.txt: [("'to", 'to'), ("'it", 'it')] WMH19080715-V06-29-page2.txt: [("'Yes", 'Yes'), ("'Why", 'Why'), ("'the", 'the'), ("'I", 'I')] WMH19080715-V06-29-page3.txt: [("'Some", 'Some'), ("'congregation", 'congregation')] WMH19080722-V06-30-page1.txt: [("'E.", 'E.')] WMH19080722-V06-30-page2.txt: [("'this", 'this')] WMH19080722-V06-30-page3.txt: [("'shown", 'shown')] WMH19080729-V06-31-page2.txt: [("'patients", 'patients'), ("'will", 'will')] WMH19080729-V06-31-page4.txt: [("'I", 'I'), ("'Sound", 'Sound'), ("'Elder", 'Elder')] WMH19080805-V06-32-page1.txt: [("'of", 'of')] WMH19080805-V06-32-page4.txt: [("'city", 'city'), ("'exercises", 'exercises'), ("'in", 'in'), ("'On", 'On')] WMH19080812-V06-33-page2.txt: [("'years", 'years')] WMH19080812-V06-33-page3.txt: [("'Hew", 'Hew'), ("'Take", 'Take')] WMH19080826-V06-34-page1.txt: [("'an", 'an'), ("'decree", 'decree'), ("'round", 'round'), ("'Twas", 'Twas')] WMH19080826-V06-34-page4.txt: [("'get", 'get'), ("'prepare", 'prepare')] WMH19080902-V06-35-page1.txt: [("'them", 'them'), ("'must", 'must'), ("'study", 'study'), ("'Neath", 'Neath')] WMH19080902-V06-35-page2.txt: [("'entire", 'entire'), ("'o", 'o'), ("'became", 'became')] WMH19080902-V06-35-page4.txt: [("'happiness.", 'happiness.')] WMH19080909-V06-36-page1.txt: [("'kingdom", 'kingdom')] WMH19080909-V06-36-page3.txt: [("'Financial", 'Financial')] WMH19080916-V06-37-page2.txt: [("'company", 'company')] WMH19080916-V06-37-page3.txt: [("'Men", 'Men'), ("'so", 'so')] WMH19080916-V06-37-page4.txt: [("'icopies", 'icopies'), ("'is", 'is')] WMH19080923-V06-38-page1.txt: [("'acknowledge", 'acknowledge')] WMH19080923-V06-38-page3.txt: [("'there", 'there')] WMH19080923-V06-38-page4.txt: [("'It", 'It')] WMH19080930-V06-39-page1.txt: [("'SABBATH", 'SABBATH')] WMH19081007-V06-40-page1.txt: [("'OW", 'OW')] WMH19081007-V06-40-page2.txt: [("'Tis", 'Tis'), ("'of", 'of')] WMH19081007-V06-40-page3.txt: [("'a", 'a'), ("'o", 'o'), ("'Well", 'Well'), ("'I", 'I')] WMH19081014-V06-41-page2.txt: [("'and", 'and'), ("'who", 'who')] WMH19081021-V06-42-page1.txt: [("'unless", 'unless'), ("'missions.", 'missions.')] WMH19081021-V06-42-page2.txt: [("'that", 'that'), ("'Abstain", 'Abstain')] WMH19081021-V06-42-page3.txt: [("'years", 'years')] WMH19081021-V06-42-page4.txt: [("'toward", 'toward')] WMH19081028-V06-43-page1.txt: [("'was", 'was')] WMH19081028-V06-43-page3.txt: [("'there", 'there')] WMH19081028-V06-43-page4.txt: [("'subscribers", 'subscribers')] WMH19081104-V06-44-page1.txt: [("'tithe", 'tithe'), ("'the", 'the'), ("'graves", 'graves')] WMH19081104-V06-44-page2.txt: [("'.be", '.be'), ("'Falls", 'Falls'), ("'to", 'to'), ("'is", 'is')] WMH19081104-V06-44-page4.txt: [("'liberally", 'liberally')] WMH19081111-V06-45-page1.txt: [("'tives", 'tives'), ("'summer", 'summer')] WMH19081111-V06-45-page3.txt: [("'Genesis", 'Genesis'), ("'separated", 'separated'), ("'lights", 'lights'), ("'beginning", 'beginning'), ("'created", 'created'), ("'signs", 'signs'), ("'form", 'form'), ("'firmament", 'firmament'), ("'Heaven.", 'Heaven.'), ("'seasons", 'seasons')]
In [36]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/WMH/correction4 Average verified rate: 0.9771874993174814 Average of error rates: 0.024615532118887822 Total token count: 915726
In [37]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[37]:
[('m', 1750), ('w', 1506), ('g', 1439), ('e', 1317), ('d', 1279), ('r', 688), ('n', 645), ("'", 490), ('f', 446), ('t', 384), ('th', 283), ('co', 172), ('oo', 171), ('sabbathschool', 163), ('io', 120), ('mt', 108), ('k', 107), ('ro', 96), ('wm', 83), ('numbess', 75), ('re', 71), ('u', 70), ("canvassers'", 58), ('x', 46), ('horr', 39), ("the'", 38), ('rd', 33), ('blendon', 32), ('ex', 32), ('brower', 31), ('harnden', 30), ("f'd", 30), ('mchugh', 29), ('seventhday', 28), ('nd', 28), ('cleora', 27), ('tion', 25), ('nunica', 23), ('sabbathschools', 23), ('q', 23), ('-', 22), ('vowyla', 21), ('al', 21), ('loth', 20), ('z', 20), ('fd', 20), ('michi', 20), ('psa', 20), ('ti', 20), ('ne', 19)]
Correction 5 -- Rejoin Split Words¶
In [39]:
# %load shared_elements/rejoin_split_words.py
prev = cycle
cycle = "correction5"
directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
os.makedirs(directories['cycle'])
corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))
for filename in corpus:
content = utilities.readfile(directories['prev'], filename)
text = re.sub(r"[0-9,!?$:;&]", " ", content)
tokens = utilities.tokenize_text(text)
errors = reports.identify_errors(tokens, spelling_dictionary)
replacements = clean.check_if_stem(errors, spelling_dictionary, tokens, get_prior=False)
if len(replacements) > 0:
print('{}: {}'.format(filename, replacements))
for replacement in replacements:
content = clean.replace_split_words(replacement, content)
else:
pass
with open(join(directories['cycle'], filename), mode="w") as o:
o.write(content)
o.close()
WMH19030128-V01-04-page4.txt: [('co', 'operate')] WMH19030415-V01-15-page2.txt: [('IMPRES', 'SIONS')] WMH19030415-V01-15-page4.txt: [('Verm', 'on')] WMH19030506-V01-18-page3.txt: [('sugg', 'estion')] WMH19030513-V01-19-page3.txt: [('th', 'in')] WMH19030520-V01-20-page3.txt: [('co', 'operating')] WMH19030603-V01-22-page1.txt: [('TES', 'TAMENT')] WMH19030603-V01-22-page3.txt: [('AB', 'OLISHED')] WMH19030603-V01-22-page4.txt: [('th', 'at'), ('co', 'laborers')] WMH19030610-V01-23-page1.txt: [('pre', 'eminently')] WMH19030610-V01-23-page3.txt: [('unscript', 'ural')] WMH19030624-V01-25-page4.txt: [('Mc', 'Bride')] WMH19030715-V01-28-page1.txt: [('mo', 'at')] WMH19030715-V01-28-page2.txt: [('th', 'e')] WMH19030715-V01-28-page3.txt: [('wa', 's'), ('developmen', 't')] WMH19030722-V01-29-page3.txt: [('Kellog', 'g')] WMH19031028-V01-43-page1.txt: [('ca', 'use')] WMH19031118-V01-46-page1.txt: [('co', 'operate')] WMH19031118-V01-46-page3.txt: [('lig', 'and')] WMH19040113-V02-03-page3.txt: [('AC', 'CEPT')] WMH19040203-V02-05-page3.txt: [('GENER', 'AL')] WMH19040203-V02-05-page4.txt: [("Sailor'", 's')] WMH19040210-V02-06-page3.txt: [('co', 'operation')] WMH19040210-V02-06-page4.txt: [('Mc', 'Allister')] WMH19040309-V02-10-page1.txt: [('CO', 'OPERATE')] WMH19040323-V02-12-page1.txt: [('conven', 'tions')] WMH19040330-V02-13-page1.txt: [('RE', 'QUISITE'), ('ac', 'es')] WMH19040330-V02-13-page3.txt: [('bili', 'Ousness')] WMH19040406-V02-14-page3.txt: [('co', 'operate')] WMH19040413-V02-15-page3.txt: [('Dimonda', 'le')] WMH19040420-V02-16-page1.txt: [('treasur', 'ers')] WMH19040427-V02-17-page3.txt: [('co', 'workers')] WMH19040427-V02-17-page4.txt: [('ti', 'the')] WMH19040608-V02-23-page3.txt: [('co', 'operation')] WMH19040608-V02-23-page4.txt: [('re', 'employ'), ('Scandina', 'vian')] WMH19040629-V02-25-page3.txt: [('co', 'workers')] WMH19040629-V02-25-page4.txt: [('th', 'at')] WMH19040706-V02-26-page2.txt: [('institu', 'tion')] WMH19040706-V02-26-page3.txt: [('re', 'turn')] WMH19040713-V02-27-page1.txt: [('CIRCUM', 'STANCES'), ('DEFI', 'NITELY'), ('Vo', 'L')] WMH19040720-V02-28-page1.txt: [('re', 'a')] WMH19040727-V02-29-page1.txt: [('re', 'checking')] WMH19040803-V02-30-page2.txt: [('re', 'hired')] WMH19040803-V02-30-page3.txt: [('re', 'engaged')] WMH19040817-V02-32-page2.txt: [('io', 'n')] WMH19040914-V02-34-page1.txt: [('co', 'laborers')] WMH19040914-V02-34-page3.txt: [('re', 'elected')] WMH19040921-V02-34a-page1.txt: [('Io', 'was')] WMH19040921-V02-34a-page2.txt: [('Responsib', 'ility')] WMH19040928-V02-35-page1.txt: [('inacces', 'sible')] WMH19040928-V02-35-page4.txt: [('re', 'opens')] WMH19041005-V02-36-page4.txt: [('corre', 'late')] WMH19041019-V02-38-page4.txt: [('Sabbath-', 'school')] WMH19041026-V02-39-page2.txt: [('PEO', 'PLE')] WMH19041026-V02-39-page3.txt: [('PRE', 'SENT')] WMH19041026-V02-39-page4.txt: [('co', 'laborer')] WMH19041102-V02-40-page3.txt: [('connec', 'tion'), ('co', 'workers')] WMH19041102-V02-40-page4.txt: [('Gei', 'sel')] WMH19041123-V02-43-page1.txt: [('sca', 't')] WMH19041123-V02-43-page4.txt: [('ti', 'e'), ('co', 'operate')] WMH19041130-V02-44-page3.txt: [('co', 'operation')] WMH19041207-V02-45-page4.txt: [('reabsorp', 'tion')] WMH19041221-V02-47-page3.txt: [('EDI', 'TION'), ('NEC', 'ESSARILY')] WMH19041228-V02-48-page1.txt: [('re', 'counting')] WMH19041228-V02-48-page2.txt: [('re', 'consecrated')] WMH19050104-V03-01-page2.txt: [('exul', 'tantly')] WMH19050111-V03-02-page2.txt: [('co', 'operate')] WMH19050118-V03-03-page3.txt: [('Ob', 'ject')] WMH19050201-V03-04-page2.txt: [('ble', 'ssed')] WMH19050201-V03-04-page4.txt: [('al', 'a')] WMH19050208-V03-05-page1.txt: [('Cre', 'W')] WMH19050215-V03-06-page4.txt: [('re', 'mains')] WMH19050222-V03-07-page2.txt: [('co', 'operation')] WMH19050222-V03-07-page3.txt: [('co', 'operate')] WMH19050301-V03-08-page1.txt: [('Mc', 'Curdy')] WMH19050322-V03-11-page3.txt: [('greate', 'r')] WMH19050322-V03-11-page4.txt: [('church-s', 'chool')] WMH19050405-V03-13-page2.txt: [('humani', 'ty')] WMH19050413-V03-14-page3.txt: [('GIV', 'EN')] WMH19050419-V03-15-page1.txt: [('co', 'operation')] WMH19050503-V03-17-page4.txt: [('un', 'able')] WMH19050510-V03-18-page4.txt: [('increas', 'ing'), ('ro', 'o')] WMH19050517-V03-19-page1.txt: [('co', 'operation')] WMH19050517-V03-19-page3.txt: [('Vermontvi', 'lle')] WMH19050524-V03-20-page4.txt: [('glor', 'ious')] WMH19050531-V03-21-page3.txt: [('HOFST', 'RA')] WMH19050531-V03-21-page4.txt: [('vis', 'ited'), ('se', 'cure')] WMH19050607-V03-22-page3.txt: [('pu', 'pils'), ('co', 'operation')] WMH19050614-V03-23-page1.txt: [('ex', 'penses')] WMH19050614-V03-23-page3.txt: [('Educa', 'tion')] WMH19050614-V03-23-page4.txt: [('co', 'operation')] WMH19050621-V03-24-page3.txt: [('re', 'echoed')] WMH19050628-V03-25-page1.txt: [('re', 'established')] WMH19050705-V03-26-page2.txt: [('co', 'operation')] WMH19050705-V03-26-page3.txt: [('soci', 'eties')] WMH19050712-V03-27-page1.txt: [('re', 'No')] WMH19050712-V03-27-page4.txt: [('Michi', 'gan')] WMH19050719-V03-28-page3.txt: [('fi', 'st')] WMH19050726-V03-29-page1.txt: [('ro', 'o')] WMH19050802-V03-30-page4.txt: [('co', 'operate')] WMH19050809-V03-31-page2.txt: [('co', 'operate')] WMH19050816-V03-32-page1.txt: [('ob', 'ject')] WMH19050830-V03-33-page3.txt: [('ap', 'plicable')] WMH19050906-V03-34-page3.txt: [('pl', 'acidity')] WMH19050906-V03-34-page4.txt: [('Wednes', 'day')] WMH19050920-V03-36-page2.txt: [('th', 'under'), ('io', 'was')] WMH19050920-V03-36-page4.txt: [('re', 'elected')] WMH19051004-V03-38-page3.txt: [('coun', 'sel')] WMH19051018-V03-40-page2.txt: [('ac', 'cepted')] WMH19051101-V03-42-page1.txt: [('TI', 'e')] WMH19051101-V03-42-page3.txt: [('suf', 'ferers')] WMH19051101-V03-42-page4.txt: [('MESSEN', 'GER'), ('Ne', 'braska')] WMH19051108-V03-43-page1.txt: [('ro', 'o')] WMH19051108-V03-43-page3.txt: [('re', 'port')] WMH19051122-V03-45-page2.txt: [('co', 'operate')] WMH19051129-V03-46-page2.txt: [('co', 'operation')] WMH19051206-V03-47-page1.txt: [('io', 'n')] WMH19051206-V03-47-page2.txt: [('ment', 'on')] WMH19051206-V03-47-page4.txt: [('re', 'organized')] WMH19051213-V03-48-page2.txt: [('th', 'a')] WMH19051213-V03-48-page3.txt: [('Ti', 'the'), ('re', 'vived'), ('ac', 'complished')] WMH19051220-V03-49-page1.txt: [('swi', 'ft'), ('co', 'operation')] WMH19051220-V03-49-page2.txt: [('G.', ''), ('co', 'operation'), ('CO', 'OPERATION')] WMH19051220-V03-49-page3.txt: [('peo', 'ple')] WMH19051227-V03-50-page1.txt: [('io', 'n')] WMH19051227-V03-50-page2.txt: [('co', 'operation')] WMH19060103-V04-01-page4.txt: [('pa', 'per'), ('giv', 'ing')] WMH19060110-V04-02-page1.txt: [('re', 'acting')] WMH19060110-V04-02-page2.txt: [('ro', 'o')] WMH19060124-V04-04-page2.txt: [('INCORPO', 'RATED')] WMH19060124-V04-04-page3.txt: [('co', 'operating'), ('remem', 'bereth')] WMH19060131-V04-05-page2.txt: [('Mis', 'o'), ('ro', 'o')] WMH19060207-V04-06-page1.txt: [('Pr', 'esident'), ('COLPORTE', 'UR'), ('ac', 'cepted')] WMH19060214-V04-07-page2.txt: [('Smi', 'th'), ('ro', 'o')] WMH19060221-V04-08-page1.txt: [('co', 'operation')] WMH19060221-V04-08-page2.txt: [('Treasur', 'er')] WMH19060228-V04-09-page1.txt: [('Janu', 'ary'), ('co', 'operation')] WMH19060228-V04-09-page2.txt: [('soci', 'eties')] WMH19060307-V04-10-page3.txt: [('co', 'operation')] WMH19060314-V04-11-page1.txt: [('co', 'operation'), ('ite', 'm')] WMH19060314-V04-11-page3.txt: [('co', 'operation'), ('Pa', 'w')] WMH19060314-V04-11-page4.txt: [('MICHI', 'GAN')] WMH19060321-V04-12-page1.txt: [('temporari', 'ly')] WMH19060321-V04-12-page3.txt: [('Pa', 'w')] WMH19060328-V04-13-page1.txt: [('co', 'operation')] WMH19060328-V04-13-page4.txt: [('suf', 'fering')] WMH19060411-V04-15-page1.txt: [('co', 'operation')] WMH19060411-V04-15-page2.txt: [('ro', 'o')] WMH19060425-V04-17-page1.txt: [('institu', 'tions')] WMH19060502-V04-18-page1.txt: [("Sec'", 'y')] WMH19060502-V04-18-page3.txt: [('ARBEI', 'TER')] WMH19060509-V04-19-page2.txt: [('PROPH', 'ECY')] WMH19060523-V04-20-page2.txt: [('appropria', 'tion')] WMH19060523-V04-20-page3.txt: [('th', 'a')] WMH19060606-V04-22-page2.txt: [('re', 'establish')] WMH19060613-V04-23-page3.txt: [('fl', 'o')] WMH19060613-V04-23-page4.txt: [('ES', 'SENTIAL')] WMH19060620-V04-24-page2.txt: [('RE', 'PENTED')] WMH19060620-V04-24-page4.txt: [('ut', 'A'), ('recitati', 'on')] WMH19060627-V04-25-page4.txt: [('ro', 'o')] WMH19060704-V04-26-page1.txt: [('ex', 'ample'), ('li', 'i')] WMH19060711-V04-27-page2.txt: [('ro', 'Jo')] WMH19060711-V04-27-page3.txt: [('Re', 'populated'), ('re', 'populated')] WMH19060718-V04-28-page1.txt: [('mo', 'I')] WMH19060718-V04-28-page2.txt: [('co', 'operation')] WMH19060718-V04-28-page4.txt: [('appe', 'tites')] WMH19060801-V04-30-page3.txt: [('ap', 'plications')] WMH19060808-V04-31-page1.txt: [('APPRO', 'PRIATED'), ('gra', 'ger')] WMH19060808-V04-31-page2.txt: [('th', 'at'), ('co', 'operation')] WMH19060822-V04-32-page3.txt: [('re', 'assure')] WMH19060905-V04-34-page2.txt: [('SPE', 'CIFIC')] WMH19060912-V04-35-page3.txt: [('re', 'turn')] WMH19061010-V04-39-page1.txt: [('re', 'locate')] WMH19061017-V04-40-page1.txt: [('TA', 'RE')] WMH19061017-V04-40-page3.txt: [('es', 'sential')] WMH19061024-V04-41-page3.txt: [('hov', 'els')] WMH19061031-V04-42-page1.txt: [('INTERNA', 'TIONAL')] WMH19061107-V04-43-page1.txt: [('Wr', 'IST')] WMH19061114-V04-44-page3.txt: [('submerg', 'ed')] WMH19061128-V04-46-page1.txt: [('whi', 'le'), ('co', 'operation')] WMH19061128-V04-46-page4.txt: [('co', 'operate')] WMH19061205-V04-47-page1.txt: [('moun', 'tains')] WMH19061226-V04-50-page1.txt: [('municipali', 'ty')] WMH19061226-V04-50-page3.txt: [('consi', 'dered')] WMH19070102-V05-01-page1.txt: [('confere', 'e')] WMH19070102-V05-01-page2.txt: [('Scandi', 'navian'), ('educa', 'tional')] WMH19070109-V05-02-page1.txt: [('co', 'operate'), ('desti', 'tute')] WMH19070109-V05-02-page3.txt: [('Meri', 'dian')] WMH19070116-V05-03-page2.txt: [('resum', 'ing')] WMH19070116-V05-03-page3.txt: [('founda', 'tion')] WMH19070123-V05-04-page1.txt: [('Ti', 'the')] WMH19070123-V05-04-page3.txt: [('secur', 'ing'), ('Co', 'operation'), ('co', 'operation'), ('includ', 'ing')] WMH19070130-V05-05-page4.txt: [('RE', 'PORTS')] WMH19070206-V05-06-page4.txt: [('co', 'operate')] WMH19070213-V05-07-page2.txt: [('co', 'operate')] WMH19070227-V05-09-page2.txt: [('Ti', 'the')] WMH19070306-V05-10-page2.txt: [('requisi', 'tes')] WMH19070306-V05-10-page3.txt: [('thi', 'nking'), ('pre', 'sented')] WMH19070313-V05-11-page4.txt: [('es', 't')] WMH19070327-V05-13-page2.txt: [('disci', 'pline')] WMH19070327-V05-13-page3.txt: [('consci', 'entious')] WMH19070327-V05-13-page4.txt: [('RE', 'CEIVED')] WMH19070410-V05-15-page1.txt: [('Stockda', 'le'), ('identit', 'y')] WMH19070410-V05-15-page2.txt: [('Legis', 'lature')] WMH19070410-V05-15-page3.txt: [('co', 'operation')] WMH19070410-V05-15-page4.txt: [('Pottervi', 'lle')] WMH19070417-V05-16-page2.txt: [('re', 'quire')] WMH19070424-V05-17-page1.txt: [('peo', 'ple')] WMH19070424-V05-17-page3.txt: [('es', 'to')] WMH19070501-V05-18-page3.txt: [('accompl', 'ished')] WMH19070508-V05-19-page1.txt: [('Co', 'operate'), ('co', 'operate')] WMH19070515-V05-20-page2.txt: [('th', 'a')] WMH19070529-V05-22-page2.txt: [('ele', 'vated')] WMH19070612-V05-24-page1.txt: [('pA', 'w')] WMH19070619-V05-25-page3.txt: [('co', 'operation')] WMH19070703-V05-27-page3.txt: [('ca', 'm')] WMH19070703-V05-27-page4.txt: [('th', 'a')] WMH19070710-V05-28-page4.txt: [('th', 'e')] WMH19070731-V05-31-page1.txt: [('re', 'No')] WMH19070731-V05-31-page2.txt: [('re', 'opened')] WMH19070814-V05-33-page3.txt: [('mis', 'pronounces')] WMH19070918-V05-37-page4.txt: [('LAN', 'GUAGE')] WMH19070925-V05-38-page1.txt: [('re', 'No')] WMH19071002-V05-39-page1.txt: [('Sanitari', 'um'), ('re', 'locate'), ('larg', 'ely'), ('M.', '')] WMH19071002-V05-39-page4.txt: [('Whi', 'tmarsh')] WMH19071009-V05-40-page2.txt: [('enf', 'orced')] WMH19071016-V05-41-page1.txt: [('un', 'Christian')] WMH19071016-V05-41-page2.txt: [('attendi', 'ng')] WMH19071023-V05-42-page3.txt: [('co', 'worker')] WMH19071106-V05-44-page1.txt: [('co', 'operation')] WMH19071106-V05-44-page2.txt: [('DISESTAB', 'LISHED')] WMH19071106-V05-44-page4.txt: [('transla', 'tions')] WMH19071120-V05-46-page3.txt: [('re', 'enlisted')] WMH19071120-V05-46-page4.txt: [('bi', 'nary')] WMH19071127-V05-47-page3.txt: [('RE', 'VIEW')] WMH19071211-V05-49-page1.txt: [('Sa', 'tan')] WMH19071211-V05-49-page4.txt: [('Ti', 'the')] WMH19071218-V05-50-page1.txt: [('Notwith', 'standing')] WMH19071218-V05-50-page3.txt: [('criti', 'cising'), ('recom', 'mendation')] WMH19080101-V06-01-page4.txt: [('Otseg', 'o')] WMH19080108-V06-02-page2.txt: [('AMA', 'DON')] WMH19080108-V06-02-page4.txt: [('th', 'at')] WMH19080115-V06-03-page2.txt: [('re', 'enact')] WMH19080115-V06-03-page3.txt: [('distri', 'bution'), ('ro', 'o')] WMH19080122-V06-04-page4.txt: [('Wellspri', 'ng')] WMH19080129-V06-05-page3.txt: [('co', 'operates')] WMH19080205-V06-06-page3.txt: [('counte', 'nance'), ('Brot', 'her')] WMH19080212-V06-07-page1.txt: [('Sa', 'bbath')] WMH19080212-V06-07-page4.txt: [('Se', 'Lected')] WMH19080219-V06-08-page2.txt: [('Ay', 'ars')] WMH19080226-V06-09-page1.txt: [('co', 'operate')] WMH19080226-V06-09-page2.txt: [('commi', 'ttees')] WMH19080304-V06-10-page1.txt: [('tA', 'M')] WMH19080304-V06-10-page4.txt: [('discipl', 'ine'), ('un', 'planned')] WMH19080311-V06-11-page1.txt: [('famil', 'iar')] WMH19080311-V06-11-page4.txt: [('co', 'operation')] WMH19080325-V06-13-page1.txt: [('re', 'read'), ('Ev', 'a')] WMH19080415-V06-16-page1.txt: [('EDUCATIONA', 'L'), ('Mc', 'Reynolds')] WMH19080415-V06-16-page4.txt: [('th', 'or')] WMH19080422-V06-17-page2.txt: [('Pottervi', 'lle')] WMH19080429-V06-18-page1.txt: [('co', 'operate')] WMH19080506-V06-19-page4.txt: [('ren', 'a')] WMH19080513-V06-20-page1.txt: [('mo', 'rA'), ('co', 'operate')] WMH19080513-V06-20-page2.txt: [('co', 'operation'), ('sor', 'a')] WMH19080603-V06-23-page1.txt: [('mechani', 'Cal')] WMH19080603-V06-23-page3.txt: [('co', 'operative')] WMH19080617-V06-25-page2.txt: [('includ', 'ing')] WMH19080701-V06-27-page1.txt: [('ex', 'pressive')] WMH19080701-V06-27-page2.txt: [('re', 'union')] WMH19080708-V06-28-page1.txt: [('FI', 'ELD')] WMH19080708-V06-28-page3.txt: [('retur', 'ned')] WMH19080715-V06-29-page1.txt: [('co', 'operation')] WMH19080715-V06-29-page3.txt: [('th', 'at')] WMH19080722-V06-30-page3.txt: [('ob', 'jections')] WMH19080722-V06-30-page4.txt: [('DEYOU', 'NG')] WMH19080729-V06-31-page1.txt: [('al', 'ways')] WMH19080805-V06-32-page2.txt: [('re', 'sided')] WMH19080812-V06-33-page3.txt: [('GENER', 'AL')] WMH19080909-V06-36-page2.txt: [('auspi', 'ces')] WMH19080916-V06-37-page2.txt: [('co', 'workers')] WMH19080923-V06-38-page2.txt: [('Educa', 'tional')] WMH19080930-V06-39-page1.txt: [('gi', 'a')] WMH19080930-V06-39-page2.txt: [('prepar', 'ed')] WMH19081007-V06-40-page4.txt: [('Al', 'ma')] WMH19081028-V06-43-page1.txt: [('gos', 'pel')] WMH19081028-V06-43-page2.txt: [('RE', 'QUIRED')] WMH19081104-V06-44-page4.txt: [('MICHI', 'GAN')] WMH19081111-V06-45-page1.txt: [('na', 'tives')]
In [42]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/WMH/correction5 Average verified rate: 0.9775893748620843 Average of error rates: 0.02419558964525408 Total token count: 915414
In [43]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[43]:
[('m', 1749), ('w', 1503), ('g', 1438), ('e', 1316), ('d', 1279), ('r', 687), ('n', 642), ("'", 490), ('f', 446), ('t', 381), ('th', 275), ('oo', 171), ('sabbathschool', 163), ('io', 115), ('mt', 108), ('k', 107), ('co', 101), ('ro', 96), ('wm', 83), ('numbess', 75), ('u', 70), ("canvassers'", 58), ('x', 46), ('horr', 39), ("the'", 38), ('rd', 33), ('blendon', 32), ('brower', 31), ('ex', 30), ('harnden', 30), ("f'd", 30), ('mchugh', 29), ('re', 29), ('seventhday', 28), ('nd', 28), ('cleora', 27), ('q', 23), ('nunica', 23), ('sabbathschools', 23), ('-', 22), ('tion', 21), ('vowyla', 21), ('fd', 20), ('psa', 20), ('z', 20), ('loth', 20), ('numbeps', 19), ('ioo', 19), ('drury', 18), ('hoffstra', 18)]
Correction 6 -- Rejoin Split Words II¶
In [45]:
# %load shared_elements/rejoin_split_words.py
prev = cycle
cycle = "correction6"
directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
os.makedirs(directories['cycle'])
corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))
for filename in corpus:
content = utilities.readfile(directories['prev'], filename)
text = re.sub(r"[0-9,!?$:;&]", " ", content)
tokens = utilities.tokenize_text(text)
errors = reports.identify_errors(tokens, spelling_dictionary)
replacements = clean.check_if_stem(errors, spelling_dictionary, tokens, get_prior=True)
if len(replacements) > 0:
print('{}: {}'.format(filename, replacements))
for replacement in replacements:
content = clean.replace_split_words(replacement, content)
else:
pass
with open(join(directories['cycle'], filename), mode="w") as o:
o.write(content)
o.close()
WMH19030415-V01-15-page4.txt: [('Confer', 'ence'), ('Gene', 'al'), ('Vermon', 'tville')] WMH19030520-V01-20-page4.txt: [('depart', 'ment')] WMH19030603-V01-22-page2.txt: [('CON', 'FERENCE')] WMH19030701-V01-26-page1.txt: [('DEPART', 'MENT')] WMH19030722-V01-29-page1.txt: [('r', 'esented')] WMH19030722-V01-29-page4.txt: [('cam', 'pmeeting')] WMH19031118-V01-46-page1.txt: [('mission', 'ary')] WMH19031118-V01-46-page3.txt: [('in', 'stil')] WMH19040127-V02-04-page4.txt: [('improve', 'ments')] WMH19040203-V02-05-page4.txt: [('at', 'tention'), ('INSTRUCT', 'ORS')] WMH19040210-V02-06-page4.txt: [('San', 'itarium')] WMH19040224-V02-08-page1.txt: [('priv', 'ilege')] WMH19040224-V02-08-page4.txt: [('S', 'hool')] WMH19040302-V02-09-page2.txt: [('atone', 'ment')] WMH19040323-V02-12-page1.txt: [('in', 'terest')] WMH19040330-V02-13-page4.txt: [('maili', 'ng')] WMH19040413-V02-15-page3.txt: [('an', 'nouncing')] WMH19040413-V02-15-page4.txt: [('SOUTH', 'ERN')] WMH19040420-V02-16-page1.txt: [('treasur', 'ers')] WMH19040504-V02-18-page2.txt: [('m', 'ost')] WMH19040504-V02-18-page4.txt: [('kin', 'gdom')] WMH19040518-V02-20-page4.txt: [('per', 'sonal')] WMH19040608-V02-23-page4.txt: [('Sec', 'retary'), ('Comm', 'ittee')] WMH19040629-V02-25-page4.txt: [('T', 'oo')] WMH19040706-V02-26-page3.txt: [('world', 'liness')] WMH19040706-V02-26-page4.txt: [('Ed', 'itor')] WMH19040720-V02-28-page1.txt: [('rea', 'ppear'), ('re', 'ligious')] WMH19040803-V02-30-page4.txt: [('n', 'ay')] WMH19040810-V02-31-page2.txt: [('r', 'emove')] WMH19040817-V02-32-page1.txt: [('ha', 'lf')] WMH19040914-V02-34-page3.txt: [('Bes', 'sie')] WMH19041005-V02-36-page3.txt: [('THANK', 'FUL')] WMH19041005-V02-36-page4.txt: [('r', 'oo')] WMH19041019-V02-38-page4.txt: [('Bat', 'tle')] WMH19041026-V02-39-page2.txt: [('EN', 'GAGED')] WMH19041102-V02-40-page3.txt: [('connec', 'tion')] WMH19041109-V02-41-page1.txt: [('San', 'itarium')] WMH19041130-V02-44-page2.txt: [('a', 'ssociation')] WMH19041130-V02-44-page3.txt: [('depart', 'ment')] WMH19041207-V02-45-page2.txt: [('confer', 'ence')] WMH19041207-V02-45-page4.txt: [('elim', 'inated'), ('reabsorp', 'tion')] WMH19041221-V02-47-page3.txt: [('REG', 'ULAR')] WMH19041228-V02-48-page1.txt: [('and', 're')] WMH19050104-V03-01-page1.txt: [('to', 'ft')] WMH19050104-V03-01-page4.txt: [('t', 'wo')] WMH19050111-V03-02-page2.txt: [('Wash', 'ington')] WMH19050111-V03-02-page4.txt: [('faith', 'ful')] WMH19050201-V03-04-page3.txt: [('K', 'inderhook')] WMH19050222-V03-07-page3.txt: [('CAN', 'VASSERS')] WMH19050322-V03-11-page6.txt: [('A', 'nna'), ('vi', 'ne')] WMH19050405-V03-13-page1.txt: [('CON', 'SIDER')] WMH19050419-V03-15-page1.txt: [('confer', 'ence')] WMH19050419-V03-15-page3.txt: [('So', 'ciety')] WMH19050503-V03-17-page3.txt: [('par', 'ents')] WMH19050510-V03-18-page1.txt: [('teach', 'ers')] WMH19050517-V03-19-page4.txt: [('Confer', 'ence')] WMH19050531-V03-21-page3.txt: [('MICH', 'IGAN')] WMH19050614-V03-23-page1.txt: [('ex', 'penses')] WMH19050621-V03-24-page4.txt: [('E', 'ndureth')] WMH19050726-V03-29-page1.txt: [('Publ', 'ic')] WMH19050802-V03-30-page2.txt: [('deliver', 'ance')] WMH19050830-V03-33-page3.txt: [('ap', 'plicable')] WMH19050920-V03-36-page1.txt: [('V', 'ideto')] WMH19051004-V03-38-page1.txt: [('pro', 'phetic')] WMH19051004-V03-38-page3.txt: [('A', 'nd')] WMH19051025-V03-41-page1.txt: [('H', 'artwell')] WMH19051025-V03-41-page3.txt: [('pro', 'vides')] WMH19051025-V03-41-page4.txt: [('M', 'adison')] WMH19051129-V03-46-page1.txt: [('Ed', 'uc')] WMH19051129-V03-46-page2.txt: [('The', 're')] WMH19051129-V03-46-page3.txt: [('sol', 'emn')] WMH19051129-V03-46-page4.txt: [('LIT', 'TLE')] WMH19051213-V03-48-page2.txt: [('POT', 'TERVILLE')] WMH19051213-V03-48-page3.txt: [('re', 'vived')] WMH19060103-V04-01-page1.txt: [('The', 're')] WMH19060117-V04-03-page1.txt: [('Mon', 'tcalm')] WMH19060117-V04-03-page2.txt: [('CONFER', 'ENCE')] WMH19060124-V04-04-page1.txt: [('Bar', 'ry')] WMH19060131-V04-05-page1.txt: [('g', 'oo')] WMH19060131-V04-05-page2.txt: [('of', 'ficer')] WMH19060131-V04-05-page4.txt: [('so', 'journ')] WMH19060214-V04-07-page2.txt: [('o', 'ff')] WMH19060221-V04-08-page2.txt: [('Mar', 'garet')] WMH19060228-V04-09-page1.txt: [('to', 'co'), ('Janu', 'ary')] WMH19060307-V04-10-page1.txt: [('Com', 'mittee')] WMH19060307-V04-10-page3.txt: [('teach', "ers'")] WMH19060314-V04-11-page1.txt: [('to', 'co')] WMH19060321-V04-12-page1.txt: [('temporari', 'ly'), ('con', 'ference')] WMH19060411-V04-15-page1.txt: [('to', 'co'), ('GATH', 'ERETH')] WMH19060411-V04-15-page2.txt: [('o', 'ro')] WMH19060502-V04-18-page1.txt: [('r', 'INER')] WMH19060502-V04-18-page3.txt: [('arrange', 'ment')] WMH19060509-V04-19-page2.txt: [('state', 'ment')] WMH19060606-V04-22-page1.txt: [('or', 'dained')] WMH19060627-V04-25-page1.txt: [('O', 'ft')] WMH19060711-V04-27-page1.txt: [('r', 'Ef')] WMH19060711-V04-27-page2.txt: [('to', 'ro')] WMH19060711-V04-27-page4.txt: [('t', 'oo')] WMH19060725-V04-29-page2.txt: [('we', 're')] WMH19060725-V04-29-page3.txt: [('COL', 'LEGE')] WMH19060808-V04-31-page1.txt: [('CAN', 'VASSERS')] WMH19060822-V04-32-page3.txt: [('Mes', 'siah')] WMH19060905-V04-34-page3.txt: [('Confer', 'ence')] WMH19060912-V04-35-page3.txt: [('to', 're')] WMH19060926-V04-37-page2.txt: [('wei', 'ght')] WMH19061017-V04-40-page1.txt: [('EAST', 'ERN')] WMH19061017-V04-40-page3.txt: [('es', 'sential')] WMH19061031-V04-42-page3.txt: [('de', 'cide')] WMH19061114-V04-44-page4.txt: [('A', 'ncient'), ('to', 'ro')] WMH19061205-V04-47-page2.txt: [('wrest', 'lers')] WMH19061205-V04-47-page4.txt: [('D', 'ecember')] WMH19070102-V05-01-page1.txt: [('g', 'oo')] WMH19070102-V05-01-page3.txt: [('t', 'hrough')] WMH19070109-V05-02-page1.txt: [('A', 'KA')] WMH19070109-V05-02-page2.txt: [('prom', 'ised')] WMH19070116-V05-03-page3.txt: [('founda', 'tion')] WMH19070306-V05-10-page3.txt: [('thi', 'nking')] WMH19070327-V05-13-page3.txt: [('consci', 'entious')] WMH19070417-V05-16-page3.txt: [('u', 'nfailing')] WMH19070424-V05-17-page1.txt: [('peo', 'ple')] WMH19070424-V05-17-page2.txt: [('IN', 'STRUCTOR')] WMH19070515-V05-20-page2.txt: [('di', 'fficulties')] WMH19070605-V05-23-page2.txt: [('ha', 'th')] WMH19070814-V05-33-page4.txt: [('Mon', 'tcalm')] WMH19070828-V05-34-page3.txt: [('rat', 'es')] WMH19070911-V05-36-page1.txt: [('d', 'ay'), ('to', 'ri')] WMH19071002-V05-39-page1.txt: [('M', 'ichigan')] WMH19071016-V05-41-page4.txt: [('go', 'od')] WMH19071106-V05-44-page1.txt: [('SUPER', 'IOR')] WMH19071106-V05-44-page2.txt: [('the', 'ist')] WMH19071106-V05-44-page4.txt: [('t', 'Aro')] WMH19071120-V05-46-page1.txt: [('ans', 'wer')] WMH19071120-V05-46-page3.txt: [('w', 'hich')] WMH19071211-V05-49-page2.txt: [('t', 'oo')] WMH19071218-V05-50-page2.txt: [('a', 'nd')] WMH19071218-V05-50-page3.txt: [('recom', 'mendation')] WMH19080101-V06-01-page4.txt: [('the', 'ft')] WMH19080115-V06-03-page3.txt: [('distri', 'bution')] WMH19080122-V06-04-page3.txt: [('r', 'eligious')] WMH19080129-V06-05-page4.txt: [('black', 'smithing')] WMH19080212-V06-07-page2.txt: [('o', 'ro')] WMH19080219-V06-08-page4.txt: [('pres', 'ent')] WMH19080304-V06-10-page2.txt: [('period', 'icals')] WMH19080311-V06-11-page1.txt: [('famil', 'iar')] WMH19080401-V06-14-page2.txt: [('Sec', 'retaries')] WMH19080401-V06-14-page3.txt: [('an', 'ther')] WMH19080422-V06-17-page3.txt: [('for', 'th')] WMH19080429-V06-18-page4.txt: [('Stu', "dents'")] WMH19080506-V06-19-page1.txt: [('con', 'vention')] WMH19080513-V06-20-page1.txt: [('con', 'tributions'), ('mo', 'rA')] WMH19080513-V06-20-page2.txt: [('num', 'ber')] WMH19080520-V06-21-page1.txt: [('e', 'rr'), ('A', 'MO')] WMH19080520-V06-21-page2.txt: [('den', 'ial')] WMH19080520-V06-21-page3.txt: [('o', 'ro')] WMH19080603-V06-23-page1.txt: [('M', 'ICHIGAN')] WMH19080610-V06-24-page4.txt: [('Adv', 'ent')] WMH19080708-V06-28-page1.txt: [('k', 'AW')] WMH19080708-V06-28-page3.txt: [('Bap', 'tist'), ('in', 'vited')] WMH19080715-V06-29-page2.txt: [('A', 'dventists')] WMH19080722-V06-30-page2.txt: [('LIB', 'ERTY'), ('John', 'ston')] WMH19080805-V06-32-page2.txt: [('near', 'ly')] WMH19080826-V06-34-page3.txt: [('class', 'es')] WMH19080909-V06-36-page2.txt: [('auspi', 'ces')] WMH19080916-V06-37-page3.txt: [('shad', 'ow')] WMH19080916-V06-37-page4.txt: [('and', 're')] WMH19080923-V06-38-page1.txt: [('t', 'IE')] WMH19080923-V06-38-page4.txt: [('Pro', 'fessor')] WMH19080930-V06-39-page1.txt: [('abo', 'ut')] WMH19081007-V06-40-page4.txt: [('min', 'isters')] WMH19081014-V06-41-page1.txt: [('i', 'ri')] WMH19081014-V06-41-page2.txt: [('o', 'ro')] WMH19081028-V06-43-page1.txt: [('gos', 'pel')] WMH19081104-V06-44-page4.txt: [('a', 're')] WMH19081111-V06-45-page5.txt: [('He', 'ra')]
In [48]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/WMH/correction6 Average verified rate: 0.977728102578558 Average of error rates: 0.024010546500479388 Total token count: 915279
In [49]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[49]:
[('m', 1746), ('w', 1502), ('g', 1438), ('e', 1316), ('d', 1278), ('r', 684), ('n', 641), ("'", 490), ('f', 446), ('t', 378), ('th', 273), ('oo', 171), ('sabbathschool', 163), ('io', 115), ('mt', 108), ('k', 105), ('co', 98), ('ro', 94), ('wm', 83), ('numbess', 75), ('u', 70), ("canvassers'", 58), ('x', 46), ('horr', 39), ("the'", 38), ('rd', 33), ('blendon', 32), ('brower', 31), ('ex', 30), ('harnden', 30), ("f'd", 30), ('mchugh', 29), ('seventhday', 28), ('cleora', 27), ('nd', 26), ('q', 23), ('nunica', 23), ('sabbathschools', 23), ('-', 22), ('tion', 21), ('vowyla', 21), ('re', 21), ('fd', 20), ('psa', 20), ('z', 20), ('loth', 20), ('numbeps', 19), ('ioo', 19), ('drury', 18), ('hoffstra', 18)]
Review Remaining Errors¶
In [50]:
reports.docs_with_high_error_rate(summary)
Out[50]:
[('WMH19081111-V06-45-page5.txt', 0.455)]
In [52]:
# %load shared_elements/high_error_rates.py
doc_keys = [x[0] for x in reports.docs_with_high_error_rate(summary) if x[1] > 0.2]
utilities.open_original_docs(doc_keys, directories['cycle'])
Opened files: WMH19081111-V06-45-page5.txt
High error document is handwritten.
In [54]:
reports.long_errors(errors_summary, min_length=15)
Out[54]:
(['heaven-appointed', 'tionofourownpeople', 'niialtioiaaavaliwailio', 'unimpressionable', 'enjoyable-service', 'carefully-arranged', 'disconnectedfrom', 'csuperintendents', 'sabbath-meetings', 'distinguishingbetween', 'great-grandchildren', 'iqiiiidiiiniinii', 'stick-to-it-iveness', 'juippliympamtuuju', 'ponderousdocument', 'influentiarwriters', 'blackstring-around-the-neck', 'self-examination', 'congregegational', 'securingappointments', 'dwellingconveniences', 'all--sabbath-school', 'thought-producing', 'danish-norwegian', 'sleepfsfuicffeicse', 'counter-campaign', 'stoop-shouldered', 'disfeliowshipped', 'desire--expression', 'soul-encouraging', 'abundantresources', 'hethatreapethgatherethfruituntolifeeternal', 'instrumentterial', 'fruituntolifeeternal', 'ceremonieswholly', 'responstbilities', 'commandment-keeping', 'encouragingteachers', 'bookslavebeenhandledmostly', 'relief-of-schools', 'great-responsibility', 'trailting-school', 'irrilirrimiiimiptimpiiir', 'eleven-twentieths', 'self-gratification', "under'compulsion", 'twatmanypersonswouldgivea', 'christianindividuals', 'southernlllinois', 'comparativelysmall', 'weddingring-i-ub', 'seventy-thousand', 'arithmetic--decimal', 'reapetagattiereth', "government--smith's", "fifteen-minutes'", 'solemnmea-ningt---a-', 'northmichigancamp-meetingat', 'receiptrnargaret', 'concerningemmarrual', 'self-satisfaction', 'wanted--assurance', 'unproductiveness', 'never-tobe-omitted', 'the-sabbath-school', 'imiminiiiiimicermin', 'over-development', 'overly-sensitive', 'stumbling-blocks', 'self-opinionated', "hartwelljn'behalf", 'onion-in-the-pocket', 'withindifference', 'selfconsciousness', 'annakemstraannddolivercrumb', 'ntalliscomparativelyquiet', 'drills--penmanship', 'cut-price-combination', 'over-enthusiastic', "iiiiiiiiiniiffffr'", 'iiiiiiviiiiiimunimmi'], 15)
Correction 7 -- Remove Long Error Tokens¶
In [57]:
# %load shared_elements/remove-tokens-with-long-strings-of-characters.py
prev = cycle
cycle = "correction7"
directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
os.makedirs(directories['cycle'])
corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))
for filename in corpus:
content = utilities.readfile(directories['prev'], filename)
text = re.sub(r"[0-9,!?$:;&]", " ", content)
tokens = utilities.tokenize_text(text)
replacements = []
replacements.append(clean.check_for_repeating_characters(tokens, "i|I"))
replacements = [item for sublist in replacements for item in sublist]
if len(replacements) > 0:
print('{}: {}'.format(filename, replacements))
for replacement in replacements:
content = clean.replace_pair(replacement, content)
else:
pass
with open(join(directories['cycle'], filename), mode="w") as o:
o.write(content)
o.close()
WMH19050104-V03-01-page4.txt: [('IIIIIIVIIIIIIMUNIMMI', ' ')] WMH19060321-V04-12-page1.txt: [("IIIIIIIIINIIffffr'", ' ')] WMH19080729-V06-31-page1.txt: [('IQIIIIdIIINIINII', ' ')]
Correction 8 -- Separate Squashed Words¶
In [59]:
# %load shared_elements/separate_squashed_words.py
import pandas as pd
from math import log
prev = cycle
cycle = "correction8"
directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
os.makedirs(directories['cycle'])
corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))
verified_tokens = []
for filename in corpus:
content = utilities.readfile(directories['prev'], filename)
clean.get_approved_tokens(content, spelling_dictionary, verified_tokens)
tokens_with_freq = dict(collections.Counter(verified_tokens))
words = pd.DataFrame(list(tokens_with_freq.items()), columns=['token','freq'])
words_sorted = words.sort_values('freq', ascending=False)
words_sorted_short = words_sorted[words_sorted.freq > 2]
sorted_list_of_words = list(words_sorted_short['token'])
corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))
for filename in corpus:
content = utilities.readfile(directories['prev'], filename)
text = utilities.strip_punct(content)
tokens = utilities.tokenize_text(text)
wordcost = dict((k, log((i+1)*log(len(sorted_list_of_words)))) for i,k in enumerate(sorted_list_of_words))
maxword = max(len(x) for x in sorted_list_of_words)
replacements = []
for token in tokens:
if not token.lower() in spelling_dictionary:
if len(token) > 17:
if re.search(r"[\-\-\'\"]", token):
pass
else:
split_string = clean.infer_spaces(token, wordcost, maxword)
list_split_string = split_string.split()
if clean.verify_split_string(list_split_string, spelling_dictionary):
replacements.append((token, split_string))
else:
pass
else:
pass
else:
pass
if len(replacements) > 0:
print("{}: {}".format(filename, replacements))
for replacement in replacements:
content = clean.replace_pair(replacement, content)
else:
pass
with open(join(directories['cycle'], filename), mode="w") as o:
o.write(content)
o.close()
WMH19030603-V01-22-page3.txt: [('distinguishingbetween', 'distinguish ing between')] WMH19040106-V02-02-page1.txt: [('Christianindividuals', 'Christian individuals')] WMH19040928-V02-35-page2.txt: [('NIIALTIOIAAAVALIWAILIO', 'N I I A L T I O I A A A V A L I W A I L I O')] WMH19050201-V03-04-page2.txt: [('irrilirrIMIIIMIPTIMPIIIR', 'ir r i l i r r I M I I I M I P T I M P I I I R')] WMH19060117-V04-03-page1.txt: [('HETHATREAPETHGATHERETHFRUITUNTOLIFEETERNAL', 'HE THAT REAPETH GATHERETH FRUIT UNTO LIFE ETERNAL')] WMH19060718-V04-28-page2.txt: [('encouragingteachers', 'encouraging teachers')] WMH19060725-V04-29-page1.txt: [('FRUITUNTOLIFEETERNAL', 'FRUIT UNTO LIFE ETERNAL')] WMH19060919-V04-36-page1.txt: [('comparativelysmall', 'comparatively small')] WMH19080408-V06-15-page1.txt: [('bookslavebeenhandledmostly', 'book slave been handled mostly')] WMH19080722-V06-30-page2.txt: [('securingappointments', 'securing appointments')] WMH19080909-V06-36-page1.txt: [('HETHATREAPETHGATHERETHFRUITUNTOLIFEETERNAL', 'HE THAT REAPETH GATHERETH FRUIT UNTO LIFE ETERNAL')]
In [62]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/WMH/correction8 Average verified rate: 0.9777330102496761 Average of error rates: 0.024005752636625122 Total token count: 915346
In [63]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[63]:
[('m', 1749), ('w', 1503), ('g', 1438), ('e', 1316), ('d', 1278), ('r', 688), ('n', 642), ("'", 490), ('f', 446), ('t', 380), ('th', 273), ('oo', 171), ('sabbathschool', 163), ('io', 115), ('mt', 108), ('k', 105), ('co', 98), ('ro', 94), ('wm', 83), ('numbess', 75), ('u', 70), ("canvassers'", 58), ('x', 46), ('horr', 39), ("the'", 38), ('rd', 33), ('blendon', 32), ('brower', 31), ('ex', 30), ('harnden', 30), ("f'd", 30), ('mchugh', 29), ('seventhday', 28), ('cleora', 27), ('nd', 26), ('q', 23), ('nunica', 23), ('sabbathschools', 23), ('-', 22), ('tion', 21), ('vowyla', 21), ('re', 21), ('fd', 20), ('psa', 20), ('z', 20), ('loth', 20), ('numbeps', 19), ('ioo', 19), ('drury', 18), ('hoffstra', 18)]
In [ ]: