TMM-OCR-Evaluation-and-Correction
In [1]:
%load_ext autoreload
In [2]:
%autoreload 2
In [3]:
from text2topics import reports
from text2topics import utilities
from text2topics import clean
import re
import os
from os import listdir
from os.path import isfile, join
import collections
In [4]:
%matplotlib inline
In [5]:
wordlist_dir = "/Users/jeriwieringa/Dissertation/drafts/data/word-lists"
wordlists = ["2016-12-07-SDA-last-names.txt",
"2016-12-07-SDA-place-names.txt",
"2016-12-08-SDA-Vocabulary.txt",
"2017-01-03-place-names.txt",
"2017-02-14-Base-Word-List-SCOWL&KJV.txt",
"2017-02-14-Roman-Numerals.txt",
"2017-03-01-Additional-Approved-Words.txt"
]
In [6]:
spelling_dictionary = utilities.create_spelling_dictionary(wordlist_dir, wordlists)
In [7]:
title = "TMM"
In [8]:
base_dir = "/Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/{}/".format(title)
Baseline¶
In [9]:
cycle = 'baseline'
In [10]:
stats = reports.overview_report(join(base_dir, cycle), spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/TMM/baseline Average verified rate: 0.9620184818421186 Average of error rates: 0.048324675324675326 Total token count: 870476
In [11]:
errors_summary = reports.get_errors_summary( stats )
reports.top_errors( errors_summary, 30 )
Out[11]:
[('-', 1111),
("'", 540),
('¥', 474),
('w', 467),
('e', 451),
('ñ', 412),
(')', 345),
('m', 331),
('n', 290),
('r', 290),
('t', 285),
('d', 273),
('con-', 259),
('f', 239),
('g', 237),
('re-', 221),
('tion', 198),
('mis-', 160),
('in-', 147),
('*', 145),
('(', 128),
('com-', 117),
('ñthe', 107),
('th', 106),
('be-', 105),
('¡', 91),
('de-', 87),
('sionary', 87),
('mission-', 82),
('ment', 78),
('ex-', 76),
('ary', 70),
('tions', 69),
('co', 68),
('x', 67),
('pa', 63),
('k', 62),
('en-', 61),
('u', 61),
('[illustrated]', 60),
('+', 60),
('per-', 59),
('pro-', 58),
('/', 58),
('z', 56),
('dis-', 53),
('ple', 51),
('peo-', 49),
('(the', 49),
('pre-', 48),
('ers', 47),
('an-', 46),
('un-', 46),
('ad-', 43),
('ence', 42),
('ñwe', 41),
('(illustrated)', 41),
('io', 41),
('oc', 40),
('_', 40),
('ber', 39),
('inter-', 39),
('ã', 39),
('ac-', 38),
('for-', 38),
('to-', 37),
('meet-', 37),
('im-', 36),
('thou-', 35),
('can-', 34),
(']', 33),
('ña', 32),
('oo', 32),
('mes-', 31),
('al-', 31)]
Check Special Character Use¶
In [12]:
reports.tokens_with_special_characters(errors_summary)
Out[12]:
[('¥', 474),
('ñ', 412),
(')', 345),
('*', 145),
('(', 128),
('ñthe', 107),
('¡', 91),
('+', 60),
('[illustrated]', 60),
('/', 58),
('(the', 49),
('ñwe', 41),
('(illustrated)', 41),
('_', 40),
('ã', 39),
(']', 33),
('ña', 32),
('ñall', 29),
('`', 27),
('(b)', 26),
('ñin', 26),
('(c)', 26),
('(a)', 26),
('(d)', 23),
('=', 23),
('ñit', 22),
('ñsubscriptions', 22),
('%', 22),
('ñelder', 21),
('departmentñ', 21),
('(a', 21),
('(march)', 20),
('ô', 19),
('¥¥', 19),
('(to', 19),
('ñan', 18),
('(see', 17),
('(for', 17),
('(april)', 16),
('(which', 16),
('ñand', 16),
('(e)', 16),
('(may)', 15),
('ñone', 15),
('\\', 15),
('(poem)', 14),
('£', 14),
('(and', 14),
('(including', 13),
('(as', 13),
('(in', 13),
('[see', 13),
('ñno', 12),
('(f)', 12),
('¥¥¥', 12),
('ñto', 12),
('holland)', 12),
('ñid', 12),
('(incorporated', 12),
('(concluded', 11),
('ñtest', 11),
('>', 11),
('(or', 11),
('[poem]', 10),
('(continued', 10),
('+contents+', 10),
('ñmrs', 10),
('ñjune', 9),
('(acts', 8),
('(j', 8),
('ñjuly', 8),
('ñbrother', 8),
('ñi', 8),
('(january)', 8),
('¥¥¥¥¥¥¥¥', 7),
('ñhe', 7),
('(fourth', 7),
('[second', 7),
('ñfrom', 7),
('(june)', 7),
('christianityñits', 7),
('ñour', 7),
('ñmay', 7),
('<', 7),
('-¥', 7),
('ñspecial', 7),
('ñfebruary', 6),
('ñjohn', 6),
('(august)', 6),
('(july)', 6),
('ñwhen', 6),
('reading)', 6),
('[fourth', 6),
('ñfacts', 6),
('ñjust', 6),
('ñapril', 6),
('(i', 6),
('reading]', 6),
('(december', 6),
('ñdr', 6),
('the¥', 6),
('ñmarch', 5),
("'¥", 5),
('(if', 5),
('ñnot', 5),
('û', 5),
('(dan', 5),
('ñsince', 5),
('\ufeff', 5),
('%x', 5),
('ñseptember', 5),
('ñhome', 5),
('them)', 5),
('[', 5),
('world)', 5),
('++', 5),
('readingñsabbath', 5),
('it)', 5),
('(there', 5),
('(september)', 5),
('land)', 5),
('(rom', 5),
('~~', 5),
('(i)', 5),
('(from', 5),
('ñthat', 5),
('(g)', 4),
('(april', 4),
('cenñ', 4),
('[the', 4),
('{', 4),
('f\x90te-day', 4),
('ñrev', 4),
('ñbecause', 4),
('`i', 4),
('(rev', 4),
('ñmr', 4),
('(march', 4),
('(work', 4),
('(with', 4),
('school)', 4),
('ñdecember', 4),
('service]', 4),
('people)', 4),
('(of', 4),
('(verse', 4),
('ñon', 4),
('(oregon)', 4),
('darjeelingñabove', 4),
('¥¥¥¥', 4),
('ñheb', 4),
("¥'", 4),
('ñprofessor', 4),
('river)', 4),
('ñabout', 4),
('ñmatt', 4),
('ñduring', 4),
('ñjanuary', 4),
('(isa', 4),
('(february', 4),
('weeks)', 3),
('¥¥¥¥¥', 3),
('(building', 3),
('natives)', 3),
('_-', 3),
('mission)', 3),
('workñit', 3),
('(nearly', 3),
('lapps)', 3),
('year)', 3),
('¥v', 3),
('^', 3),
('ñevery', 3),
('a¥nd', 3),
('o)', 3),
('(that', 3),
('workñ', 3),
('ñmissionary', 3),
('(r', 3),
('water)', 3),
('))', 3),
('called)', 3),
('*the', 3),
('organ)', 3),
('-(', 3),
('}', 3),
('cooked)', 3),
('(it', 3),
('i)', 3),
('ñsel', 3),
('oolooberiaña', 3),
('[*', 3),
('ñas', 3),
('themñand', 3),
('ñoctober', 3),
('ñlast', 3),
('(called', 3),
('fund)', 3),
('¥-¥', 3),
('¥i', 3),
('may)', 3),
('(not', 3),
('ñisa', 3),
('(one', 3),
('ñs', 3),
('ñat', 3),
('ñthis', 3),
('(denmark', 3),
('ñpart', 3),
('exercise]', 3),
('(december)', 3),
('watchwordñ', 3),
('ñyes', 3),
('ñthere', 3),
('ñfor', 3),
('(john', 3),
('¥t', 3),
('are)', 3),
('±', 3),
('[illustrated', 3),
('brazilñ', 3),
('(kansas)', 2),
('(ind', 2),
('(most', 2),
('✓', 2),
('(meaning', 2),
('ñtwo', 2),
('r)', 2),
('ñgospel', 2),
('package)', 2),
('//', 2),
('ñspurgeon', 2),
('ñnew', 2),
('(six', 2),
('ago)', 2),
('ñgod', 2),
("+'", 2),
('departme\\t', 2),
('oneñthat', 2),
('ñselected', 2),
('s/', 2),
('_a', 2),
('ñso', 2),
('here)', 2),
('(though', 2),
('ö', 2),
('g%', 2),
('¥¥¥¥¥¥¥¥¥¥', 2),
('(about', 2),
('(ad', 2),
('(mark', 2),
('ruary)', 2),
('humanityñto', 2),
('spirit)', 2),
('¥the', 2),
('stãpittsburg', 2),
("¡'", 2),
('(f', 2),
('daughter)', 2),
('f\x90te', 2),
('missio\\ary', 2),
('(religious', 2),
('hallña', 2),
('only)', 2),
('(all', 2),
('ñyea', 2),
('ñletters', 2),
('/t', 2),
('¥s', 2),
('ñseveral', 2),
('christñthe', 2),
('*ghest', 2),
('(dutch', 2),
('troubleñwhether', 2),
(')v', 2),
('`<', 2),
('ñother', 2),
('day)', 2),
('ñbut', 2),
('¥%', 2),
('times)', 2),
('(t', 2),
('ñr', 2),
('children)', 2),
('raceñthe', 2),
('(february)', 2),
('ioo¡', 2),
('cñtwin', 2),
('(christ)', 2),
('worldñis', 2),
('(trinidad)', 2),
('ñword', 2),
('week)', 2),
('(in-', 2),
('ñthey', 2),
('ñdesire', 2),
('ç', 2),
('colony)', 2),
('plata)', 2),
('(margin)', 2),
('time)', 2),
('gospelñthe', 2),
('the_', 2),
('(now', 2),
('church)', 2),
('¥new', 2),
('old)', 2),
('ñsome', 2),
('#', 2),
('ñwith', 2),
('english)', 2),
('ñchristian', 2),
('(but', 2),
('(southern)', 2),
('(denmark)', 2),
('t*', 2),
('(on', 2),
('states)', 2),
('ñof', 2),
('town)', 2),
('caf\x8es', 2),
('¥e', 2),
('churchñthe', 2),
('cut)', 2),
('worldñto', 2),
('nomñthe', 2),
('ig*', 2),
('(they', 2),
('ñif', 2),
('(miss', 2),
('ñpsalm', 2),
('april)', 2),
('(feb-', 2),
('exampleñthe', 2),
('ñtee', 2),
('ñerratum', 2),
('verseñ', 2),
('magazine)', 2),
('(january', 2),
('(holy', 2),
('¥well', 2),
('one¥', 2),
('¥he', 2),
('specialñthe', 2),
('fool)', 2),
('peopleñi', 2),
('(concluded)', 2),
('ñphillips', 2),
('feet)', 2),
('ñby', 2),
('(signs', 2),
('ý', 2),
('ñstudent', 2),
('z¥', 2),
('c¥', 2),
('ñis', 2),
('a¡', 2),
('[in', 2),
('ñspiritual', 2),
('parts)', 2),
('citiesñand', 2),
('(verses', 2),
('allñ', 2),
('beñthe', 2),
('-`', 2),
('e¥', 2),
('a¥re', 2),
('(coolies)', 2),
('jews)', 2),
('(an', 2),
('(sometimes', 2),
('ñhis', 2),
('ñluke', 2),
('i¥', 2),
('bay)', 2),
('ñdo', 2),
('cheetstãphiladelphia', 2),
('(generally', 2),
('ñout', 2),
('life)', 2),
('themñthe', 2),
('(thoroughly', 2),
(')+', 2),
('erectedñone', 2),
('/-', 2),
('do)', 2),
('our¥', 2),
('(light', 2),
('(revelation', 2),
('godñnot', 2),
('(may', 2),
('[a', 2),
('(gal', 2),
('sayñand', 2),
('ñreview', 2),
('(later', 2),
('(michigan)', 2),
('**', 2),
('days)', 2),
('manña', 2),
('map)', 2),
('man)', 2),
('mallettñdear', 2),
('***', 2),
('``', 2),
('ñmy', 2),
('(god)', 2),
('ary)', 2),
('(alabama)', 2),
('[to', 2),
('(little', 2),
('(fig', 2),
('victoriañbut', 1),
('healthfully/', 1),
('thisñdishonesty', 1),
('(entre', 1),
('first)', 1),
('ñthree', 1),
('importa]', 1),
('partnerñnow', 1),
('_t_h/so', 1),
('ôof', 1),
('ñsucce', 1),
('churchñit', 1),
('menñthe', 1),
('body)', 1),
('(church)', 1),
('englandñto', 1),
('solitudeñthe', 1),
('(yang-tse', 1),
('f¥', 1),
('restñfor', 1),
('presentñperhaps', 1),
('preparationsñnot', 1),
('thingñfor', 1),
('ñaugust', 1),
('examination)', 1),
('possessionsñall', 1),
('*t', 1),
('societyñ', 1),
('yetñis', 1),
('dollarsñnine', 1),
("r'r%", 1),
('[orang', 1),
('kittsñthey', 1),
('text=bookñnovember', 1),
('(leap', 1),
('_enjoys', 1),
('/l', 1),
('millionsñone-third', 1),
('gatherings)', 1),
('cornñmealiesñis', 1),
('v/', 1),
('enciesñgrand', 1),
('power¥', 1),
('weaknessñ', 1),
('(thena', 1),
('winterñall', 1),
('ñlet', 1),
('myselfñduring', 1),
('on(', 1),
('hregardingv/', 1),
('floorsñthough', 1),
('first¥', 1),
('vationñhis', 1),
('encouraged)', 1),
('monstersñthe', 1),
('tonñunexcelled', 1),
('(wakenaam)', 1),
("tea'ã'is", 1),
('usñsend', 1),
('ôtis', 1),
('¥a', 1),
('worldñextends', 1),
('ñmost', 1),
('gu`', 1),
('vaticanñthe', 1),
('(cow', 1),
('*since', 1),
('argentina_', 1),
('ñg', 1),
('lotñit', 1),
('(j)', 1),
('countryñi', 1),
('slavesñcaptives', 1),
('stampsñamounting', 1),
('a#', 1),
('images)', 1),
('roomsñone', 1),
('messageñcaptain', 1),
('(sabbath-', 1),
('bondsñthese', 1),
('lakes)', 1),
('proml_tly', 1),
('saleñor', 1),
('continued)', 1),
('sister¥', 1),
('(helsingfors)', 1),
('(local', 1),
('about¥', 1),
('miiiiim=', 1),
('slightñfrom', 1),
('christiansñwe', 1),
('islesñst', 1),
('king¥', 1),
('w(', 1),
('(lao-tsze', 1),
('amo\\g', 1),
('(o', 1),
('toolñso', 1),
('fraternityñwhen', 1),
('messageña', 1),
('electro=hydropathic', 1),
('womenñcome', 1),
('(jamaica', 1),
('feverñ', 1),
('floor)', 1),
('demandñability', 1),
('knowledgeñthe', 1),
('studyñthe', 1),
('dayñso', 1),
('that)', 1),
('benedictionñelder', 1),
('ministersñall', 1),
('philippinesñbishop', 1),
('rageñall', 1),
('ãit', 1),
('familyñhe', 1),
('ñent', 1),
('continentñafrica', 1),
('this¥', 1),
('libertyñpolit-', 1),
('\\j', 1),
('doctorñassistant', 1),
('(adventist', 1),
('tal)', 1),
('sionaries)', 1),
('`voorlooper', 1),
('sideñand', 1),
('ñseven', 1),
('ñdifferent', 1),
('j¥', 1),
('actsñin', 1),
('ñmaybe', 1),
("'(", 1),
('c)', 1),
('ñsowing', 1),
('`there', 1),
('ground)', 1),
('letterñtwo', 1),
('biscuit)', 1),
('committeeñthat', 1),
('`kc', 1),
('classñthose', 1),
('nursesñwe', 1),
('ñpaul', 1),
('macheteñthe', 1),
('t`', 1),
('ñjesus', 1),
('a-*/**/¥', 1),
("^'cottiteri", 1),
('additionto_abont', 1),
('soulñ', 1),
('viewñabsolutely', 1),
('on/daniel', 1),
('/ft', 1),
('cattleñsecond', 1),
('(only', 1),
('fatallyñthe', 1),
('directionñgo', 1),
('arthur)', 1),
('instructionsñ', 1),
('dampña', 1),
('motherhoodñcannot', 1),
('biographyñthe', 1),
('¥+r', 1),
(')l', 1),
('goingñ', 1),
('itñthe', 1),
('insectsñthey', 1),
('orderñthe', 1),
('millionñrussia', 1),
('kilaueañprobably', 1),
('countriesñan', 1),
('(speaking', 1),
('*henever', 1),
('countriesñguaranteed', 1),
('superiorñ', 1),
('=mummy', 1),
('cityñand', 1),
('ã-_', 1),
('¥what', 1),
('message)', 1),
('a^or', 1),
('breadfruitña', 1),
('encouragingñbut', 1),
('fieldsñit', 1),
('winterñalways', 1),
('crosses)', 1),
('mules)', 1),
('ñcanon', 1),
('againñ', 1),
('doneñwhen', 1),
('(at', 1),
('ancestorsñfor', 1),
('(continental', 1),
('boundñby', 1),
("-%'", 1),
('¥interest', 1),
('%-', 1),
('ñanoust', 1),
('¥professor', 1),
('destructionñbecause', 1),
('paper*', 1),
('ñplainly', 1),
('goodñwe', 1),
('australiañstellenbosch', 1),
('the/', 1),
('believers)', 1),
('fel¥', 1),
('car_', 1),
('slavesñslaves', 1),
('kingña', 1),
('`commanded', 1),
('billowsñall', 1),
('-/', 1),
('himñmay', 1),
('(apartment', 1),
('ct)', 1),
('agoutiña', 1),
('(fields)', 1),
('margin)', 1),
('numeralsñthe', 1),
('tiv(ptilst', 1),
('indiañ', 1),
('alaska)', 1),
('lazmig[', 1),
('inches=', 1),
('timeñthat', 1),
('(phil', 1),
('¥but', 1),
('eatñalthough', 1),
('missionary`', 1),
('hzinû', 1),
('(col', 1),
('__', 1),
('himselfñwithout', 1),
('[food]', 1),
('racesñkafir', 1),
('climate)', 1),
('crocodile)', 1),
('ñlaces', 1),
('mapsñno', 1),
('ourselvesñhere', 1),
('`lo', 1),
('dutyñthat', 1),
('weekñdecember', 1),
('stateñbolivia', 1),
('laile¥city', 1),
('exceptions)', 1),
('nameaa„ss❑', 1),
('colorsñthey', 1),
('(caravansary)', 1),
('marriageñher', 1),
('harborñsaid', 1),
('writeñeven', 1),
('¡s', 1),
('months_', 1),
('i¥¥', 1),
('thought)', 1),
('malesñwere', 1),
('w(`', 1),
('oppositionñwere', 1),
('-an/', 1),
('(mich', 1),
('patonñthat', 1),
('heardñsublimer', 1),
('(when', 1),
('ñhaving', 1),
('augustñin', 1),
('formerñthey', 1),
('brownñ', 1),
('hospitableñwilling', 1),
('soldiersñwhat', 1),
('actlt`', 1),
('\\i', 1),
('(kwi)', 1),
('spainñlonged', 1),
('baptizedñone', 1),
('¥sasnoh', 1),
('doneñ', 1),
('^ids', 1),
('seekñgo', 1),
("_masse'", 1),
('missionaryñat', 1),
('asiañtheir', 1),
('familyñthe', 1),
('carriage¥road', 1),
('(after', 1),
('earn=', 1),
('ôô`', 1),
('(freedom', 1),
('tk%', 1),
('certainñ', 1),
('selfñof', 1),
('(very', 1),
('obi-women)', 1),
('iiiiii=viii', 1),
('(their', 1),
('ñeducation', 1),
('(servants)', 1),
('ñbooker', 1),
('text-bookñ', 1),
('ñsmith', 1),
('wellñtime', 1),
('agesñi', 1),
('hulañperformed', 1),
('``yellow', 1),
('<¥', 1),
('countriesñfrance', 1),
('`and', 1),
('cruzñaside', 1),
('ropeñthe', 1),
('especiallyñand', 1),
('groundñperhaps', 1),
("ã'it", 1),
('spanish)', 1),
('*igit', 1),
('neighborsñone', 1),
('ic)', 1),
('theeñpray', 1),
('saved)', 1),
('¡god', 1),
('peaksñpopocatepetl', 1),
('ill_', 1),
('ñtestimonies', 1),
('charcoalñand', 1),
("(')", 1),
('ñgermany', 1),
('policyñhe', 1),
('rabbitñsupposing', 1),
('philip>', 1),
("ñkerr's", 1),
('weekñ', 1),
('possessionñwhere', 1),
('ñten', 1),
('texasñthe', 1),
('¥every', 1),
('desireñright', 1),
('loveñi', 1),
('%¥', 1),
('othersña', 1),
('(naini', 1),
('lifeñunto', 1),
('badñas', 1),
('republic)', 1),
('¥of', 1),
('macheteña', 1),
('(moravian)', 1),
('handsñmore', 1),
('papersñcopies', 1),
('ñdoes', 1),
('usñso', 1),
('fieldsñi', 1),
('stationñthe', 1),
('(mule-drivers)', 1),
('ñjanuanv', 1),
('styleñby', 1),
('christñit', 1),
('m)', 1),
('*presenting', 1),
('planñ', 1),
('babylon)', 1),
('ñmarca', 1),
('tokenñof', 1),
('loveñdie', 1),
('falls)', 1),
('antilles)', 1),
('loebsack)', 1),
('trvtk`t', 1),
('ñjoshua', 1),
('comeñ', 1),
('islandsñ', 1),
('stãboston', 1),
('levuñ', 1),
('¥chicago', 1),
('argentineñwill', 1),
('faithñnot', 1),
('ãtoward', 1),
('farmsñonly', 1),
('california¥', 1),
('nationsñall', 1),
('accommodated)', 1),
('guageñ', 1),
('fast-daysñdays', 1),
('someñthey', 1),
('bationñthe', 1),
('has_not', 1),
('chinañeven', 1),
('(d', 1),
('understandñfor', 1),
('complainingñonly', 1),
('-_-', 1),
('mail)', 1),
('creatureñ', 1),
("'lgl`", 1),
('mother-in-lawñthere', 1),
('¥be', 1),
('part)', 1),
('(zech', 1),
('so/apper', 1),
('«iay', 1),
('quartñmuch', 1),
('(ex', 1),
('ñtan', 1),
('enoughñto', 1),
('a)', 1),
('strangersñbut', 1),
('c+p', 1),
("ta'*", 1),
('destinationñcaravellasñinquire', 1),
('ñwomen', 1),
('worshipñsun', 1),
('iã', 1),
('liveñthey', 1),
('log)', 1),
('mexico)', 1),
('necessaries_', 1),
('olanchoñsavannas', 1),
('matabelelandñ', 1),
('soft`', 1),
('hong()', 1),
('tub*', 1),
('ñmark', 1),
('prisonña', 1),
('companyñ', 1),
('\\ad-', 1),
('chineseñhigh', 1),
('historyñso', 1),
('livingstoneñthe', 1),
('f`', 1),
('kv*mk', 1),
('=-¥', 1),
('classñthe', 1),
('*****', 1),
('coveringñonly', 1),
('=lead', 1),
('strugglesñthat', 1),
('countryñreceived', 1),
('enunciatedñtruths', 1),
('(mr', 1),
('descriptionñthe', 1),
('menñmen', 1),
('aspectñ', 1),
('destinationña', 1),
('rã', 1),
('torch)', 1),
('frontikl**-i', 1),
('thingsñand', 1),
('themñi', 1),
('t)', 1),
('four¥', 1),
('exerciseñnovember', 1),
('placeñtreating', 1),
('usñthat', 1),
('understand)', 1),
('tune)', 1),
('chinese¥', 1),
('salvadorñthe', 1),
('—segari', 1),
('peopleñthere', 1),
('wayñ', 1),
('groundñthat', 1),
('¤elf-governing', 1),
('doorsñthe', 1),
('monthly)', 1),
('_thee', 1),
('beñ', 1),
('(three', 1),
('beliefsñsome', 1),
('furnitureñ', 1),
('gardenñplaces', 1),
('(patience)', 1),
('yards)', 1),
('expression)', 1),
('godñeven', 1),
('denseñ', 1),
('whichñkusaie', 1),
('kindsñand', 1),
('yearly)', 1),
('truthñ', 1),
('coloradoñ', 1),
('ñoun', 1),
('mals)', 1),
('sir)', 1),
('aljna/-', 1),
('seañas', 1),
('gloomñthe', 1),
('itself)', 1),
('eaten)', 1),
('degradingñit', 1),
('*elder', 1),
('g¥', 1),
('ó', 1),
('christian)', 1),
('personal)', 1),
('medicineñand', 1),
('yearñever', 1),
('[prague]', 1),
('laborersñmr', 1),
("\\'i", 1),
('¥¥¥-¥', 1),
('worldñthe', 1),
('leftñwe', 1),
('illiterateñthe', 1),
('(satisfied)', 1),
('ir_', 1),
('v¥a', 1),
('switzerlandñhong', 1),
('jerusalemñthey', 1),
('neckñtonsilitis', 1),
('ñbefore', 1),
('uresñchanges', 1),
('ñeducational', 1),
('(n)', 1),
('(rum)', 1),
('to-o)', 1),
('creoles)', 1),
('blackñabout', 1),
('iû', 1),
('ñam', 1),
('fieldsñwhether', 1),
('lifô', 1),
('(his', 1),
('glad_', 1),
('(rome)', 1),
('floorñand', 1),
('¥in', 1),
('_that', 1),
('(unless', 1),
('aitutakians)', 1),
('pestñthe', 1),
('abroadñthe', 1),
('widowhoodñall', 1),
('switzerlandñbulu-', 1),
('(grave-', 1),
('restore¥', 1),
("(+'", 1),
('a_llc', 1),
('sliogunateñso', 1),
('laborer)', 1),
('wordñto', 1),
('(gospel', 1),
('viveritt/', 1),
('(almost', 1),
('-*', 1),
('(beneath', 1),
('ginñmore', 1),
('wasñjesus', 1),
("`surveying'", 1),
('(hot', 1),
('grythyttehedñnoted', 1),
('taotaisñ', 1),
('sectsñthe', 1),
('journeyñand', 1),
('exerciseñoctober', 1),
('a¥', 1),
('amphitheaterñto', 1),
('milesñmore', 1),
('(iowa)', 1),
('personsñnatives', 1),
('(joppa', 1),
('gospelñat', 1),
('(embraces)', 1),
('settlementñwhich', 1),
('voyageñ', 1),
('pôco', 1),
('operations¥', 1),
('ñreading', 1),
('/tis', 1),
('ñex-', 1),
('sceneryñand', 1),
('worldñif', 1),
('gehenna)', 1),
('natalñgeneva', 1),
('t#tnr=ligt', 1),
('steamerñthree', 1),
('groundñeven', 1),
('(some', 1),
('aristocraticñthe', 1),
('ñcyrus', 1),
('*isi', 1),
('valleyñ', 1),
('ginzañthe', 1),
('[when', 1),
('to-morrowñwhile', 1),
('door)', 1),
('-***', 1),
('truthñlearn', 1),
('journeyñto', 1),
...]
Correction 1 -- Normalize Characters¶
In [14]:
# %load shared_elements/normalize_characters.py
prev = "baseline"
cycle = "correction1"
directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
os.makedirs(directories['cycle'])
corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))
for filename in corpus:
content = utilities.readfile(directories['prev'], filename)
# Substitute for all other dashes
content = re.sub(r"—-—–‑", r"-", content)
# Substitute formatted apostrophe
content = re.sub(r"\’\’\‘\'\‛\´", r"'", content)
# Replace all special characters with a space (as these tend to occur at the end of lines)
content = re.sub(r"[^a-zA-Z0-9\s,.!?$:;\-&\'\"]", r" ", content)
with open(join(directories['cycle'], filename), mode="w") as o:
o.write(content)
o.close()
In [17]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/TMM/correction1 Average verified rate: 0.9689343941867684 Average of error rates: 0.03858441558441559 Total token count: 869740
In [19]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[19]:
[('-', 1170),
("'", 573),
('e', 479),
('w', 475),
('m', 337),
('t', 314),
('r', 307),
('d', 301),
('n', 295),
('f', 268),
('con-', 259),
('g', 250),
('re-', 222),
('tion', 198),
('mis-', 161),
('in-', 149),
('com-', 117),
('th', 109),
('be-', 105),
('de-', 87),
('sionary', 87),
('mission-', 82),
('ment', 78),
('ex-', 77),
('ary', 74),
('x', 72),
('co', 70),
('tions', 69),
('u', 63),
('pa', 63),
('k', 63),
('en-', 61),
('per-', 59),
('pro-', 58),
('z', 58),
('dis-', 53),
('ple', 51),
('peo-', 49),
('pre-', 48),
('ers', 47),
('un-', 46),
('an-', 46),
('ad-', 44),
('ence', 42),
('io', 42),
('oc', 40),
('ber', 40),
('inter-', 39),
('for-', 38),
('ac-', 38)]
Correction 2 -- Correct Line Endings¶
In [21]:
# %load shared_elements/correct_line_endings.py
prev = cycle
cycle = "correction2"
directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
os.makedirs(directories['cycle'])
corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))
for filename in corpus:
content = utilities.readfile(directories['prev'], filename)
content = re.sub(r"(\w+)(\-\s{1,})([a-z]+)", r"\1\3", content)
with open(join(directories['cycle'], filename), mode="w") as o:
o.write(content)
o.close()
In [24]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/TMM/correction2 Average verified rate: 0.9817024929526814 Average of error rates: 0.02631636363636364 Total token count: 862030
In [25]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[25]:
[('-', 1157),
("'", 573),
('e', 479),
('w', 475),
('m', 337),
('t', 312),
('r', 305),
('d', 301),
('n', 295),
('f', 267),
('g', 250),
('th', 109),
('x', 72),
('co', 69),
('pa', 63),
('k', 63),
('u', 63),
('z', 58),
('io', 42),
('oc', 40),
('mis-', 39),
('oo', 33),
('cc', 29),
('sionary', 29),
('--', 28),
('money-order', 24),
("'the", 23),
('q', 21),
('al', 21),
('mt', 20),
('ary', 19),
('id', 19),
('spanish-speaking', 19),
('hausaland', 19),
("''", 19),
('stauffer', 19),
('ft', 18),
('mo', 18),
('zo', 18),
('basle', 18),
('re', 18),
('hasegawa', 17),
('couva', 17),
('kalaka', 17),
('-the', 17),
('sul', 17),
('okohira', 16),
('ro', 16),
('sabbathschool', 15),
('pp', 15)]
Correction 3 -- Remove Extra Dashes¶
In [27]:
# %load shared_elements/remove_extra_dashes.py
prev = cycle
cycle = "correction3"
directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
os.makedirs(directories['cycle'])
corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))
for filename in corpus:
content = utilities.readfile(directories['prev'], filename)
text = re.sub(r"[0-9,!?$:;&]", " ", content)
tokens = utilities.tokenize_text(text)
replacements = []
for token in tokens:
if token[0] is "-":
replacements.append((token, token[1:]))
elif token[-1] is "-":
replacements.append((token, token[:-1]))
else:
pass
if len(replacements) > 0:
print("{}: {}".format(filename, replacements))
for replacement in replacements:
content = clean.replace_pair(replacement, content)
else:
pass
with open(join(directories['cycle'], filename), mode="w") as o:
o.write(content)
o.close()
TMM18980101-V10-01-page1.txt: [('Mis-', 'Mis')]
TMM18980101-V10-01-page12.txt: [('-one', 'one'), ('-brought', 'brought'), ('-their', 'their'), ('-worship', 'worship')]
TMM18980101-V10-01-page13.txt: [('-appears', 'appears')]
TMM18980101-V10-01-page14.txt: [('-was', 'was')]
TMM18980101-V10-01-page15.txt: [('-Baptists', 'Baptists')]
TMM18980101-V10-01-page23.txt: [('respond-', 'respond')]
TMM18980101-V10-01-page26.txt: [('Waterloo-', 'Waterloo'), ('-Jamaica.', 'Jamaica.'), ('-by', 'by')]
TMM18980101-V10-01-page28.txt: [('-this', 'this')]
TMM18980101-V10-01-page32.txt: [('-WE', 'WE')]
TMM18980101-V10-01-page4.txt: [('--a', '-a'), ('-', ''), ('-rse', 'rse'), ('-', '')]
TMM18980101-V10-01-page9.txt: [('ene-', 'ene')]
TMM18980201-V10-02-page11.txt: [('---', '--')]
TMM18980201-V10-02-page13.txt: [('-K', 'K'), ('-N', 'N'), ('AricuN-', 'AricuN'), ('-', ''), ('-', '')]
TMM18980201-V10-02-page14.txt: [('Anglo-', 'Anglo'), ('-too', 'too')]
TMM18980201-V10-02-page17.txt: [('-miles', 'miles'), ('op-', 'op')]
TMM18980201-V10-02-page22.txt: [('prom-', 'prom')]
TMM18980201-V10-02-page27.txt: [('-', '')]
TMM18980201-V10-02-page32.txt: [('-', '')]
TMM18980201-V10-02-page33.txt: [('CON-', 'CON'), ('-', '')]
TMM18980201-V10-02-page35.txt: [('liter-', 'liter')]
TMM18980201-V10-02-page37.txt: [('MIS-', 'MIS'), ('QUAR-', 'QUAR'), ('.-', '.')]
TMM18980201-V10-02-page38.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM18980201-V10-02-page39.txt: [('SECRE-', 'SECRE'), ('Mts-', 'Mts')]
TMM18980201-V10-02-page6.txt: [('-', '')]
TMM18980201-V10-02-page7.txt: [('Nes-', 'Nes')]
TMM18980201-V10-02-page9.txt: [('-', ''), ('-', '')]
TMM18980301-V10-03-page12.txt: [('Gar-', 'Gar')]
TMM18980301-V10-03-page16.txt: [('-', '')]
TMM18980301-V10-03-page19.txt: [('-', '')]
TMM18980301-V10-03-page24.txt: [('Mis-', 'Mis')]
TMM18980301-V10-03-page25.txt: [('O-----', 'O----'), ('-reveladas', 'reveladas'), ('galar-', 'galar')]
TMM18980301-V10-03-page28.txt: [('com-', 'com')]
TMM18980301-V10-03-page31.txt: [('-be', 'be'), ('-work', 'work')]
TMM18980301-V10-03-page32.txt: [('---g.', '--g.'), ('-P.', 'P.'), ('-krka', 'krka')]
TMM18980301-V10-03-page37.txt: [('estab-', 'estab'), ('Jan-', 'Jan')]
TMM18980301-V10-03-page38.txt: [('-', '')]
TMM18980301-V10-03-page39.txt: [('Mis-', 'Mis'), ('Mis-', 'Mis'), ('-', '')]
TMM18980301-V10-03-page5.txt: [('-', '')]
TMM18980301-V10-03-page6.txt: [('C--', 'C-'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('..-', '..'), ('-', ''), ('-', '')]
TMM18980401-V10-04-page15.txt: [('-', '')]
TMM18980401-V10-04-page17.txt: [('-', '')]
TMM18980401-V10-04-page26.txt: [('HISTOR-', 'HISTOR')]
TMM18980401-V10-04-page3.txt: [('-', '')]
TMM18980401-V10-04-page30.txt: [('-', '')]
TMM18980401-V10-04-page31.txt: [('-', '')]
TMM18980401-V10-04-page33.txt: [('-I', 'I')]
TMM18980401-V10-04-page38.txt: [('encourag-', 'encourag')]
TMM18980401-V10-04-page4.txt: [('-', ''), ('-c-', 'c-'), ('-', ''), ('s-', 's'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('e-', 'e'), ('-', ''), ('-.t.', '.t.'), ('-', ''), ('-', ''), ('-', ''), ('..-', '..'), ('-', '')]
TMM18980401-V10-04-page40.txt: [('--Did', '-Did')]
TMM18980401-V10-04-page6.txt: [('WORK-', 'WORK')]
TMM18980501-V10-05-page17.txt: [('-', '')]
TMM18980501-V10-05-page24.txt: [('-T.', 'T.')]
TMM18980501-V10-05-page25.txt: [('-', ''), ('-', '')]
TMM18980501-V10-05-page28.txt: [('-.', '.'), ('i-', 'i'), ('-d', 'd'), ('-', ''), ('.-', '.'), ('-', ''), ('-s-azppos', 's-azppos')]
TMM18980501-V10-05-page29.txt: [('-', '')]
TMM18980501-V10-05-page30.txt: [('-', '')]
TMM18980501-V10-05-page31.txt: [('"Teu-', '"Teu')]
TMM18980501-V10-05-page35.txt: [('MIS-', 'MIS'), ('QUAR-', 'QUAR'), ('-', '')]
TMM18980501-V10-05-page36.txt: [('-', ''), ('.-', '.')]
TMM18980501-V10-05-page37.txt: [('has-', 'has')]
TMM18980501-V10-05-page38.txt: [('-THREE', 'THREE')]
TMM18980501-V10-05-page39.txt: [('re-', 're')]
TMM18980601-V10-06-page16.txt: [('-', ''), ('-', '')]
TMM18980601-V10-06-page20.txt: [('-------A--', '------A--'), ('-.', '.'), ('..-', '..'), ('--', '-'), ('--', '-'), ('--..', '-..'), ('-...t', '...t'), ('..-', '..'), ('-', ''), ("....-k'-", "....-k'"), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('K-', 'K'), ('-', ''), ('.-', '.'), ('..--', '..-'), ('-.', '.'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-AN', 'AN'), ('-', ''), ('--', '-'), ('ir-', 'ir'), ('-', ''), ('--', '-'), ('-AI', 'AI'), ('--', '-'), ('-', ''), ('-', ''), ('-', ''), ('.--', '.-'), ('---', '--')]
TMM18980601-V10-06-page22.txt: [('lb.-', 'lb.')]
TMM18980601-V10-06-page23.txt: [('-JUNE', 'JUNE'), ('-', '')]
TMM18980601-V10-06-page24.txt: [('-', ''), ('-', '')]
TMM18980601-V10-06-page25.txt: [('-', ''), ('-', '')]
TMM18980601-V10-06-page26.txt: [('doing-', 'doing')]
TMM18980601-V10-06-page36.txt: [('-', ''), ('leav-', 'leav')]
TMM18980701-V10-07-page10.txt: [('-', '')]
TMM18980701-V10-07-page11.txt: [('-lying', 'lying')]
TMM18980701-V10-07-page12.txt: [('--How', '-How')]
TMM18980701-V10-07-page30.txt: [('Amsterdam-', 'Amsterdam')]
TMM18980701-V10-07-page36.txt: [('inter-', 'inter')]
TMM18980701-V10-07-page38.txt: [('-c', 'c'), ('..-', '..'), ('-', ''), ('-', '')]
TMM18980701-V10-07-page4.txt: [('num-', 'num'), ('-', '')]
TMM18980701-V10-07-page40.txt: [('MAG-', 'MAG')]
TMM18980701-V10-07-page42.txt: [('-', '')]
TMM18980701-V10-07-page6.txt: [('Young-', 'Young')]
TMM18980801-V10-08-page21.txt: [('Euro-', 'Euro')]
TMM18980801-V10-08-page24.txt: [('conse-', 'conse')]
TMM18980801-V10-08-page26.txt: [('MISSIONARY-', 'MISSIONARY')]
TMM18980801-V10-08-page31.txt: [('z-', 'z'), ('bountifully."-', 'bountifully."')]
TMM18980801-V10-08-page35.txt: [('MIS-', 'MIS'), ('QUAR-', 'QUAR'), ('.-', '.'), ('.-', '.')]
TMM18980801-V10-08-page38.txt: [('MIS-', 'MIS'), ('MIS-', 'MIS')]
TMM18980901-V10-09-page13.txt: [('-in', 'in')]
TMM18980901-V10-09-page16.txt: [('house-', 'house')]
TMM18980901-V10-09-page31.txt: [('-refining', 'refining'), ('-', '')]
TMM18980901-V10-09-page35.txt: [('me-', 'me')]
TMM18980901-V10-09-page37.txt: [('tear-drops--', 'tear-drops-')]
TMM18980901-V10-09-page8.txt: [('-', '')]
TMM18980901-V10-09-page9.txt: [('-view', 'view')]
TMM18981001-V10-10-page11.txt: [('-and', 'and'), ('-Fiance', 'Fiance'), ('-', '')]
TMM18981001-V10-10-page16.txt: [('-would', 'would')]
TMM18981001-V10-10-page18.txt: [('-', '')]
TMM18981001-V10-10-page19.txt: [('-', '')]
TMM18981001-V10-10-page21.txt: [('-rented', 'rented'), ('-', '')]
TMM18981001-V10-10-page28.txt: [('-', ''), ('edu-', 'edu')]
TMM18981001-V10-10-page30.txt: [('-', '')]
TMM18981001-V10-10-page33.txt: [('-', '')]
TMM18981001-V10-10-page34.txt: [('-', ''), ('-', '')]
TMM18981001-V10-10-page37.txt: [('-', ''), ('-c', 'c')]
TMM18981001-V10-10-page38.txt: [('-', '')]
TMM18981101-V10-11-page12.txt: [('-', '')]
TMM18981101-V10-11-page17.txt: [('MIS-', 'MIS')]
TMM18981101-V10-11-page20.txt: [('cor-', 'cor')]
TMM18981101-V10-11-page25.txt: [('MAGA-', 'MAGA')]
TMM18981101-V10-11-page27.txt: [('fol-', 'fol')]
TMM18981101-V10-11-page29.txt: [('-rendering', 'rendering')]
TMM18981101-V10-11-page30.txt: [('-', ''), ('liter-', 'liter')]
TMM18981101-V10-11-page31.txt: [('-', '')]
TMM18981101-V10-11-page33.txt: [('-at', 'at')]
TMM18981101-V10-11-page34.txt: [('-', '')]
TMM18981101-V10-11-page35.txt: [('-', '')]
TMM18981101-V10-11-page36.txt: [('SOCI-', 'SOCI'), ('.-', '.'), ('MIS-', 'MIS'), ('QUAR-', 'QUAR')]
TMM18981101-V10-11-page37.txt: [('-', ''), ('MAGA-', 'MAGA')]
TMM18981101-V10-11-page38.txt: [('MAGA-', 'MAGA')]
TMM18981101-V10-11-page6.txt: [('great-', 'great')]
TMM18981101-V10-11-page7.txt: [('-', '')]
TMM18981201-V10-12-page13.txt: [('igno-', 'igno')]
TMM18981201-V10-12-page17.txt: [('interme-', 'interme')]
TMM18981201-V10-12-page19.txt: [('-little', 'little')]
TMM18981201-V10-12-page2.txt: [('-mighty', 'mighty'), ('op-', 'op')]
TMM18981201-V10-12-page23.txt: [('-her.', 'her.')]
TMM18981201-V10-12-page27.txt: [('-', '')]
TMM18981201-V10-12-page32.txt: [('-', '')]
TMM18981201-V10-12-page36.txt: [('Sab-', 'Sab')]
TMM18981201-V10-12-page4.txt: [('-', '')]
TMM18981201-V10-12-page41.txt: [('-', '')]
TMM18981201-V10-12-page43.txt: [('--Near', '-Near'), ('-THE', 'THE')]
TMM18981201-V10-12-page44.txt: [('Par-', 'Par')]
TMM18981201-V10-12-page45.txt: [('Character-', 'Character')]
TMM18981201-V10-12-page46.txt: [('Mission-', 'Mission')]
TMM18990101-V11-01-page12.txt: [('-teach', 'teach'), ('-standing', 'standing'), ('-the', 'the')]
TMM18990101-V11-01-page13.txt: [('-', '')]
TMM18990101-V11-01-page14.txt: [('-', ''), ("'-", "'"), ('oranges-and-', 'oranges-and'), ('-I', 'I'), ('-', ''), ('-of', 'of'), ('-mines', 'mines'), ('-', '')]
TMM18990101-V11-01-page17.txt: [('flower-', 'flower')]
TMM18990101-V11-01-page19.txt: [('-', '')]
TMM18990101-V11-01-page2.txt: [('THI-', 'THI')]
TMM18990101-V11-01-page25.txt: [('-', '')]
TMM18990101-V11-01-page27.txt: [('-the', 'the'), ('-people', 'people'), ('-their', 'their'), ('-of', 'of')]
TMM18990101-V11-01-page28.txt: [('-', '')]
TMM18990101-V11-01-page29.txt: [('DAR-ES-', 'DAR-ES'), ('MIS-', 'MIS')]
TMM18990101-V11-01-page31.txt: [('the-', 'the')]
TMM18990101-V11-01-page32.txt: [('-voyage.', 'voyage.')]
TMM18990101-V11-01-page36.txt: [('success-', 'success')]
TMM18990101-V11-01-page38.txt: [('whole-', 'whole')]
TMM18990101-V11-01-page44.txt: [('stop--', 'stop-')]
TMM18990101-V11-01-page45.txt: [('-', '')]
TMM18990101-V11-01-page47.txt: [('-', ''), ('PHILADEL-', 'PHILADEL'), ('MISSION-', 'MISSION')]
TMM18990101-V11-01-page48.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('.-', '.'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-.', '.'), ('-', ''), ('repre-', 'repre'), ('-.', '.')]
TMM18990101-V11-01-page8.txt: [('-rainy', 'rainy')]
TMM18990101-V11-01-page9.txt: [('-', ''), ('Spanish-', 'Spanish')]
TMM18990201-V11-02-page1.txt: [('Guiana-', 'Guiana')]
TMM18990201-V11-02-page13.txt: [('-', '')]
TMM18990201-V11-02-page15.txt: [('-', '')]
TMM18990201-V11-02-page18.txt: [('un-', 'un')]
TMM18990201-V11-02-page23.txt: [('-', '')]
TMM18990201-V11-02-page25.txt: [('-', '')]
TMM18990201-V11-02-page28.txt: [('-', '')]
TMM18990201-V11-02-page29.txt: [('WORK-', 'WORK')]
TMM18990201-V11-02-page30.txt: [('-as', 'as'), ('-development', 'development'), ("-of'", "of'"), ('suf-', 'suf')]
TMM18990201-V11-02-page38.txt: [('-they', 'they')]
TMM18990201-V11-02-page41.txt: [('con-', 'con')]
TMM18990201-V11-02-page47.txt: [('-', ''), ('-', ''), ('-', '')]
TMM18990201-V11-02-page48.txt: [('inquir-', 'inquir')]
TMM18990201-V11-02-page51.txt: [('-', '')]
TMM18990201-V11-02-page52.txt: [('MIS-', 'MIS'), ('QUAR-', 'QUAR'), ('-', ''), ('.-', '.'), ('.-', '.'), ('-', ''), ('.-', '.'), ('-', ''), ('.-', '.'), ('.-', '.'), ('.-', '.'), ('-', ''), ('r.-', 'r.'), ('.-', '.')]
TMM18990201-V11-02-page53.txt: [('other-', 'other')]
TMM18990201-V11-02-page54.txt: [('-', ''), ('PHILADEL-', 'PHILADEL'), ('MISSION-', 'MISSION')]
TMM18990201-V11-02-page55.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-.', '.'), ('-', ''), ('-', ''), ("-'", "'"), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-P', 'P')]
TMM18990201-V11-02-page9.txt: [('-', '')]
TMM18990301-V11-03-page11.txt: [('for-', 'for'), ('-', '')]
TMM18990301-V11-03-page13.txt: [('-', ''), ("----'", "---'")]
TMM18990301-V11-03-page25.txt: [('-"Christian', '"Christian')]
TMM18990301-V11-03-page29.txt: [('HALE.-', 'HALE.')]
TMM18990301-V11-03-page30.txt: [('-still', 'still')]
TMM18990301-V11-03-page31.txt: [('HOFFMAN-', 'HOFFMAN')]
TMM18990301-V11-03-page37.txt: [('-', ''), ('con-', 'con')]
TMM18990301-V11-03-page38.txt: [('PHILADEL-', 'PHILADEL'), ('MISSION-', 'MISSION')]
TMM18990301-V11-03-page39.txt: [('-', ''), ('-', ''), ('-', ''), ('-.', '.'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-.', '.')]
TMM18990301-V11-03-page40.txt: [('-', '')]
TMM18990301-V11-03-page7.txt: [('-Burrus', 'Burrus')]
TMM18990301-V11-03-page9.txt: [('frame-', 'frame')]
TMM18990401-V11-04-page10.txt: [('the-', 'the')]
TMM18990401-V11-04-page18.txt: [('-', '')]
TMM18990401-V11-04-page23.txt: [('-', '')]
TMM18990401-V11-04-page26.txt: [('-and', 'and')]
TMM18990401-V11-04-page29.txt: [('-', ''), ('-', ''), ('-', '')]
TMM18990401-V11-04-page38.txt: [('Indo-', 'Indo'), ('PHILADEL-', 'PHILADEL'), ('MISSION-', 'MISSION')]
TMM18990401-V11-04-page39.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-I-', 'I-'), ('PreNit-', 'PreNit'), ('-', ''), ('repre-', 'repre'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM18990501-V11-05-page16.txt: [('-Syria', 'Syria')]
TMM18990501-V11-05-page24.txt: [('of-', 'of')]
TMM18990501-V11-05-page31.txt: [('English-', 'English'), ('French-', 'French')]
TMM18990501-V11-05-page34.txt: [('-SABBATH', 'SABBATH')]
TMM18990501-V11-05-page35.txt: [('-', ''), ('-', '')]
TMM18990501-V11-05-page37.txt: [('-', '')]
TMM18990501-V11-05-page39.txt: [('-', '')]
TMM18990501-V11-05-page41.txt: [('-', '')]
TMM18990501-V11-05-page42.txt: [('MIS-', 'MIS'), ('QUAR-', 'QUAR'), ('-', ''), ('.-', '.'), ('-', '')]
TMM18990501-V11-05-page43.txt: [('con-', 'con')]
TMM18990501-V11-05-page45.txt: [('--IA', '-IA')]
TMM18990501-V11-05-page46.txt: [('-', ''), ('time.-', 'time.'), ('PHILADEL-', 'PHILADEL'), ('MISSION-', 'MISSION')]
TMM18990501-V11-05-page47.txt: [('-.', '.'), ('-', ''), ('.-', '.'), ('-', ''), ('-', ''), ('repre-', 'repre')]
TMM18990601-V11-06-page1.txt: [('-wide', 'wide')]
TMM18990601-V11-06-page11.txt: [('-', ''), ('-every', 'every'), ('fever.-', 'fever.'), ('-no', 'no')]
TMM18990601-V11-06-page12.txt: [('-lines', 'lines')]
TMM18990601-V11-06-page2.txt: [('-is', 'is')]
TMM18990601-V11-06-page3.txt: [('-is', 'is')]
TMM18990601-V11-06-page30.txt: [('-', '')]
TMM18990601-V11-06-page38.txt: [('-', '')]
TMM18990601-V11-06-page39.txt: [('-almost', 'almost'), ('-heathen', 'heathen'), ('-uses', 'uses'), ('-the', 'the'), ('-very', 'very'), ('-', '')]
TMM18990601-V11-06-page4.txt: [('-we', 'we'), ('-DO', 'DO'), ('work.-', 'work.'), ('-for', 'for')]
TMM18990601-V11-06-page46.txt: [('PHILADEL-', 'PHILADEL'), ('MISSION-', 'MISSION'), ('-', '')]
TMM18990601-V11-06-page47.txt: [('-', '')]
TMM18990601-V11-06-page7.txt: [('things.--', 'things.-')]
TMM18990701-V11-07-page11.txt: [('ex-', 'ex'), ('p-', 'p')]
TMM18990701-V11-07-page13.txt: [('-the', 'the')]
TMM18990701-V11-07-page16.txt: [('the-', 'the')]
TMM18990701-V11-07-page17.txt: [('-of', 'of'), ('-these', 'these'), ('-the', 'the'), ('-to', 'to')]
TMM18990701-V11-07-page19.txt: [('reading--', 'reading-')]
TMM18990701-V11-07-page2.txt: [('-the', 'the'), ('-term', 'term'), ('"When-', '"When'), ('-teacher', 'teacher'), ('-to', 'to')]
TMM18990701-V11-07-page20.txt: [('theerec-', 'theerec')]
TMM18990701-V11-07-page23.txt: [('-provision', 'provision'), ('reached-', 'reached'), ('Iztaccihuatl-', 'Iztaccihuatl')]
TMM18990701-V11-07-page26.txt: [('V-', 'V')]
TMM18990701-V11-07-page27.txt: [('so--', 'so-')]
TMM18990701-V11-07-page28.txt: [('-because', 'because')]
TMM18990701-V11-07-page32.txt: [('.-', '.'), ('-', '')]
TMM18990701-V11-07-page36.txt: [('formerly-', 'formerly'), ('receiving-', 'receiving')]
TMM18990701-V11-07-page37.txt: [('pray-', 'pray'), ('-era.', 'era.')]
TMM18990701-V11-07-page4.txt: [('-recent', 'recent')]
TMM18990701-V11-07-page40.txt: [('-', '')]
TMM18990701-V11-07-page42.txt: [('-', ''), ('of-', 'of')]
TMM18990701-V11-07-page43.txt: [('-fices', 'fices')]
TMM18990701-V11-07-page46.txt: [('the-', 'the'), ('MIS-', 'MIS')]
TMM18990701-V11-07-page5.txt: [('gov-', 'gov'), ('-to', 'to')]
TMM18990701-V11-07-page9.txt: [('Fahren-', 'Fahren')]
TMM18990801-V11-08-page11.txt: [('-.', '.'), ('-', ''), ('-', ''), ('-.', '.'), ('-E', 'E'), ('--s', '-s'), ('-"', '"'), ('-I.', 'I.'), ('C".-', 'C".'), ('-t-', 't-'), ('-', ''), ('-C', 'C'), ("-'", "'"), ('-', ''), ('Vcc-', 'Vcc'), ('-', ''), ('-', ''), ('-l', 'l'), ('-c', 'c'), ('-P', 'P'), ('TIc.-', 'TIc.'), ('-Lt.', 'Lt.'), ('a-', 'a'), ('-C', 'C'), ('-.', '.'), ('-c', 'c'), ('-c', 'c'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('m-', 'm'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('--', '-'), ('-', ''), ('-c', 'c'), ('..-', '..'), ('-L', 'L'), ('lec-', 'lec'), ('-', ''), ('.F-', '.F'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-se-tt', 'se-tt'), ('N-', 'N'), ('-', ''), ("it'-", "it'"), ('-V', 'V'), ('-', ''), ('-', ''), ('-', ''), ('iV-', 'iV'), ('-', ''), ('-', ''), ('I.-', 'I.'), ('-', ''), ('-', ''), ('-.', '.'), ("-'", "'"), ('--', '-'), ('-it', 'it'), ('mew.Pgx-', 'mew.Pgx'), ("-T'..", "T'.."), ('-', ''), ('lectlf-', 'lectlf'), ('-.', '.'), ('-', ''), ('-', ''), ('-', ''), ('-.c', '.c'), ('-', ''), ('-.r.', '.r.'), ('-P', 'P'), ('-', ''), ('-', ''), (".''rt.Mgk-", ".''rt.Mgk"), ('-', ''), ('-.', '.'), ('-c', 'c'), ('-', ''), ('-', ''), ('cte-', 'cte'), ("-'", "'"), ('-', ''), ('--', '-'), ('-', ''), ('-', ''), ('-', ''), ('-.', '.'), ('.-', '.'), ('-c', 'c'), ('-', ''), ('Llc-', 'Llc'), ('-', ''), ('-LAE', 'LAE'), ('-', ''), ('-', ''), ('-ore', 'ore')]
TMM18990801-V11-08-page18.txt: [('-', '')]
TMM18990801-V11-08-page2.txt: [('-', '')]
TMM18990801-V11-08-page21.txt: [('-', '')]
TMM18990801-V11-08-page31.txt: [('-upon', 'upon'), ('-that', 'that'), ('-.means', '.means')]
TMM18990801-V11-08-page32.txt: [('-', ''), ('-', '')]
TMM18990801-V11-08-page33.txt: [('-that', 'that'), ('-thousand', 'thousand'), ('-', '')]
TMM18990801-V11-08-page36.txt: [('orr-', 'orr')]
TMM18990801-V11-08-page37.txt: [('-"like', '"like'), ('-that', 'that'), ('-', '')]
TMM18990801-V11-08-page40.txt: [('-that', 'that')]
TMM18990801-V11-08-page41.txt: [('-emptied', 'emptied'), ('Him-', 'Him'), ('-self', 'self'), ('-consume', 'consume'), ('-', '')]
TMM18990801-V11-08-page43.txt: [('-', '')]
TMM18990801-V11-08-page45.txt: [('MIS-', 'MIS'), ('.-', '.'), ('-', ''), ('-', '')]
TMM18990801-V11-08-page46.txt: [('-OFFICE', 'OFFICE'), ('-', ''), ('MIS-', 'MIS')]
TMM18990801-V11-08-page9.txt: [('-', '')]
TMM18990901-V11-09-page1.txt: [('-', '')]
TMM18990901-V11-09-page23.txt: [('-', '')]
TMM18990901-V11-09-page25.txt: [('-the', 'the'), ('-they', 'they'), ('-teach', 'teach')]
TMM18990901-V11-09-page27.txt: [('-to', 'to')]
TMM18990901-V11-09-page3.txt: [('-wa-re', 'wa-re'), ('MISSION-', 'MISSION')]
TMM18990901-V11-09-page34.txt: [('-', '')]
TMM18990901-V11-09-page36.txt: [('.-', '.')]
TMM18990901-V11-09-page44.txt: [('-', '')]
TMM18990901-V11-09-page46.txt: [('-', ''), ('MIS-', 'MIS')]
TMM18990901-V11-09-page47.txt: [('-A--', 'A--'), ('-', ''), ('-al', 'al'), ('-', '')]
TMM18991001-V11-10-page1.txt: [('wit-', 'wit')]
TMM18991001-V11-10-page10.txt: [('--', '-')]
TMM18991001-V11-10-page14.txt: [('-', ''), ('QUEENSLAND.-', 'QUEENSLAND.')]
TMM18991001-V11-10-page16.txt: [('-', '')]
TMM18991001-V11-10-page2.txt: [('-', '')]
TMM18991001-V11-10-page3.txt: [('-that', 'that')]
TMM18991001-V11-10-page30.txt: [('-devote', 'devote')]
TMM18991001-V11-10-page4.txt: [('-', '')]
TMM18991001-V11-10-page44.txt: [('-', ''), ('be-', 'be')]
TMM18991001-V11-10-page45.txt: [('-What', 'What')]
TMM18991001-V11-10-page46.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('MIS-', 'MIS')]
TMM18991001-V11-10-page5.txt: [('Mc-', 'Mc'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM18991101-V11-11-page12.txt: [('-we', 'we')]
TMM18991101-V11-11-page23.txt: [('.-', '.'), ("c'te-", "c'te"), ('-', ''), ('i"-', 'i"'), ("-'...", "'..."), ('-', ''), ('F--', 'F-'), ('C-', 'C'), ('-.', '.')]
TMM18991101-V11-11-page24.txt: [('-', ''), ('-', ''), ('-a', 'a')]
TMM18991101-V11-11-page27.txt: [('-to', 'to')]
TMM18991101-V11-11-page32.txt: [('-numbered', 'numbered'), ('-', '')]
TMM18991101-V11-11-page33.txt: [('-holy', 'holy')]
TMM18991101-V11-11-page37.txt: [('MAGA-', 'MAGA')]
TMM18991101-V11-11-page40.txt: [('-', ''), ('---First', '--First')]
TMM18991101-V11-11-page42.txt: [('MIS-', 'MIS')]
TMM18991101-V11-11-page43.txt: [('sur-', 'sur')]
TMM18991101-V11-11-page44.txt: [('-', ''), ('Side-', 'Side')]
TMM18991101-V11-11-page46.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('MIS-', 'MIS')]
TMM18991101-V11-11-page47.txt: [('r-', 'r'), ('-', ''), ('A-', 'A')]
TMM18991201-V11-12-page18.txt: [('.-', '.')]
TMM18991201-V11-12-page20.txt: [('-', '')]
TMM18991201-V11-12-page29.txt: [('-', '')]
TMM18991201-V11-12-page3.txt: [('mist-', 'mist')]
TMM18991201-V11-12-page34.txt: [('-than', 'than')]
TMM18991201-V11-12-page36.txt: [('-', '')]
TMM18991201-V11-12-page37.txt: [('-', '')]
TMM18991201-V11-12-page38.txt: [('-', ''), ('-', '')]
TMM18991201-V11-12-page39.txt: [('-', ''), ('-', ''), ('-', '')]
TMM18991201-V11-12-page40.txt: [('-', ''), ('-', ''), ('-', '')]
TMM18991201-V11-12-page41.txt: [('-', ''), ('-', '')]
TMM18991201-V11-12-page42.txt: [('-', '')]
TMM18991201-V11-12-page45.txt: [('-', '')]
TMM18991201-V11-12-page46.txt: [('-', ''), ('MIS-', 'MIS')]
TMM18991201-V11-12-page9.txt: [('-', '')]
TMM19000101-V12-01-page10.txt: [('-i', 'i'), ('-.', '.'), ('-', ''), ('--', '-'), ('---', '--'), ('------', '-----'), ('A-', 'A'), ('-', ''), ('-', ''), ("-.'", ".'"), ('-', ''), ('-', ''), ('..-', '..'), ('-.Z---....', '.Z---....'), ('-', ''), ("--.'b", "-.'b"), ('-', ''), ('X-', 'X'), ("-l'''", "l'''"), ('-', ''), ('-', ''), ('-.', '.'), ('-', ''), ('-', ''), ('-', ''), ('..-', '..')]
TMM19000101-V12-01-page2.txt: [('con-', 'con')]
TMM19000101-V12-01-page25.txt: [('self-', 'self')]
TMM19000101-V12-01-page3.txt: [('-v-tvot', 'v-tvot')]
TMM19000101-V12-01-page30.txt: [('-drew', 'drew'), ('primi-', 'primi')]
TMM19000101-V12-01-page33.txt: [('-', '')]
TMM19000101-V12-01-page34.txt: [('-', '')]
TMM19000101-V12-01-page37.txt: [('MAGA-', 'MAGA')]
TMM19000101-V12-01-page38.txt: [('MAG-', 'MAG'), ('-', ''), ('-', '')]
TMM19000101-V12-01-page39.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19000101-V12-01-page4.txt: [('Nice."-', 'Nice."')]
TMM19000101-V12-01-page40.txt: [('-', ''), ('-', ''), ('-', ''), ('judg-', 'judg'), ('-', '')]
TMM19000101-V12-01-page41.txt: [('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19000101-V12-01-page42.txt: [('-', ''), ('-', ''), ('-', '')]
TMM19000101-V12-01-page43.txt: [('-', ''), ('-', '')]
TMM19000101-V12-01-page44.txt: [('MIS-', 'MIS')]
TMM19000101-V12-01-page46.txt: [('-BISHOP', 'BISHOP')]
TMM19000101-V12-01-page47.txt: [('MAGA-', 'MAGA'), ('MISSION-', 'MISSION')]
TMM19000101-V12-01-page48.txt: [('-MISSIONARY', 'MISSIONARY')]
TMM19000101-V12-01-page50.txt: [('MAGA-', 'MAGA'), ('MIS-', 'MIS')]
TMM19000101-V12-01-page51.txt: [('-.Seventh', '.Seventh'), ('-page', 'page'), ('earn-', 'earn'), ('-', ''), ('PRO-', 'PRO')]
TMM19000101-V12-01-page52.txt: [('DEVELOP-', 'DEVELOP')]
TMM19000101-V12-01-page6.txt: [('un-', 'un')]
TMM19000201-V12-02-page1.txt: [('IN-', 'IN')]
TMM19000201-V12-02-page13.txt: [('-', '')]
TMM19000201-V12-02-page15.txt: [('weak-', 'weak')]
TMM19000201-V12-02-page19.txt: [('-cannot', 'cannot')]
TMM19000201-V12-02-page2.txt: [('-sold.', 'sold.')]
TMM19000201-V12-02-page3.txt: [('-', '')]
TMM19000201-V12-02-page32.txt: [('-', '')]
TMM19000201-V12-02-page33.txt: [('-', '')]
TMM19000201-V12-02-page34.txt: [('MAGAZINE-', 'MAGAZINE'), ('-', ''), ('-', ''), ('-', '')]
TMM19000201-V12-02-page35.txt: [('-', ''), ('-', ''), ('-', '')]
TMM19000201-V12-02-page36.txt: [('-', ''), ('-', ''), ('Medo-', 'Medo'), ('corre-', 'corre'), ('-each', 'each'), ('-', ''), ('-', '')]
TMM19000201-V12-02-page37.txt: [('-', ''), ('-', ''), ('-', '')]
TMM19000201-V12-02-page38.txt: [('-', ''), ('-', '')]
TMM19000201-V12-02-page39.txt: [('-', ''), ('-', ''), ('-', ''), ('-MARCH', 'MARCH')]
TMM19000201-V12-02-page40.txt: [('-', ''), ('-', '')]
TMM19000201-V12-02-page41.txt: [('-', '')]
TMM19000201-V12-02-page43.txt: [('mission-', 'mission')]
TMM19000201-V12-02-page44.txt: [('-farm', 'farm'), ('-their', 'their')]
TMM19000201-V12-02-page46.txt: [('MIS-', 'MIS')]
TMM19000201-V12-02-page47.txt: [('in-', 'in')]
TMM19000201-V12-02-page49.txt: [('Miss-', 'Miss')]
TMM19000201-V12-02-page50.txt: [('-', ''), ('MIS-', 'MIS')]
TMM19000201-V12-02-page51.txt: [('ANIMAL."PRO-', 'ANIMAL."PRO'), ('-', ''), ('-text', 'text'), ('-', ''), ('-tiFFI', 'tiFFI'), ('H.-', 'H.'), ('-', ''), ('-', ''), ('-page', 'page'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19000201-V12-02-page52.txt: [('-', ''), ('al-', 'al')]
TMM19000301-V12-03-page10.txt: [('Hongkong----', 'Hongkong---'), ('-', ''), ('-who', 'who')]
TMM19000301-V12-03-page11.txt: [('-', '')]
TMM19000301-V12-03-page13.txt: [('table-', 'table')]
TMM19000301-V12-03-page18.txt: [('-', ''), ('going-', 'going')]
TMM19000301-V12-03-page2.txt: [('-', ''), ('-', '')]
TMM19000301-V12-03-page26.txt: [('-', '')]
TMM19000301-V12-03-page34.txt: [('-', ''), ('na-', 'na')]
TMM19000301-V12-03-page35.txt: [('-', '')]
TMM19000301-V12-03-page36.txt: [('-', ''), ('-', '')]
TMM19000301-V12-03-page39.txt: [('-', '')]
TMM19000301-V12-03-page41.txt: [('-', ''), ('-', ''), ('-', '')]
TMM19000301-V12-03-page42.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19000301-V12-03-page43.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19000301-V12-03-page44.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19000301-V12-03-page45.txt: [('Superin-', 'Superin'), ('-', ''), ('thy-', 'thy'), ('--Ider', '-Ider'), ('-school', 'school'), ('-turn', 'turn')]
TMM19000301-V12-03-page47.txt: [('-a', 'a'), ('-a', 'a'), ('C-', 'C')]
TMM19000301-V12-03-page48.txt: [('-', ''), ('-ton', 'ton')]
TMM19000301-V12-03-page5.txt: [('expres-', 'expres'), ('-They', 'They')]
TMM19000301-V12-03-page8.txt: [('Wall-', 'Wall'), ('-', '')]
TMM19000301-V12-03-page9.txt: [('M-', 'M'), ('-c.', 'c.'), ('X-', 'X'), ('---', '--')]
TMM19000401-V12-04-page1.txt: [('-', '')]
TMM19000401-V12-04-page14.txt: [('-', ''), ('-', ''), ('-', '')]
TMM19000401-V12-04-page15.txt: [('-', '')]
TMM19000401-V12-04-page2.txt: [('con-', 'con')]
TMM19000401-V12-04-page33.txt: [('-future', 'future')]
TMM19000401-V12-04-page39.txt: [('-', '')]
TMM19000401-V12-04-page40.txt: [('-utmost', 'utmost')]
TMM19000401-V12-04-page43.txt: [('-', '')]
TMM19000401-V12-04-page44.txt: [('-', ''), ('-', ''), ('-', '')]
TMM19000401-V12-04-page45.txt: [('-', ''), ('-', ''), ('-', '')]
TMM19000401-V12-04-page46.txt: [('-', ''), ('-', ''), ('-', ''), ('expedi-', 'expedi')]
TMM19000401-V12-04-page47.txt: [('-', ''), ('-', '')]
TMM19000401-V12-04-page48.txt: [('-public', 'public')]
TMM19000401-V12-04-page50.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('MIS-', 'MIS')]
TMM19000401-V12-04-page51.txt: [('-r', 'r'), ('-need-to', 'need-to'), ('Sub-', 'Sub'), ('C-', 'C'), ('Postpaid.-', 'Postpaid.'), ('-', '')]
TMM19000401-V12-04-page52.txt: [('-York.', 'York.')]
TMM19000401-V12-04-page7.txt: [('-', ''), ('-', '')]
TMM19000401-V12-04-page8.txt: [('con-', 'con')]
TMM19000501-V12-05-page10.txt: [('Saint-', 'Saint')]
TMM19000501-V12-05-page11.txt: [('-', '')]
TMM19000501-V12-05-page12.txt: [('cere-', 'cere')]
TMM19000501-V12-05-page14.txt: [('examina-', 'examina')]
TMM19000501-V12-05-page15.txt: [('-', '')]
TMM19000501-V12-05-page22.txt: [('-', '')]
TMM19000501-V12-05-page25.txt: [('-', ''), ('---', '--'), ('-------', '------'), ('--.', '-.'), ('--', '-'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('----', '---'), ('--.', '-.'), ('-', ''), ('-.-.', '.-.'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('.----', '.---'), ('---', '--'), ('------.', '-----.'), ('-', ''), ('t.---', 't.--'), ('-', ''), ('-', ''), ('----.-', '---.-'), ('-', '')]
TMM19000501-V12-05-page26.txt: [('third-', 'third')]
TMM19000501-V12-05-page29.txt: [('restric-', 'restric')]
TMM19000501-V12-05-page3.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19000501-V12-05-page32.txt: [('MISSION-', 'MISSION')]
TMM19000501-V12-05-page37.txt: [('re-', 're')]
TMM19000501-V12-05-page39.txt: [('-MAY', 'MAY'), ('-', ''), ('-', ''), ('-', '')]
TMM19000501-V12-05-page40.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19000501-V12-05-page41.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19000501-V12-05-page42.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-JUNE', 'JUNE'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19000501-V12-05-page43.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19000501-V12-05-page44.txt: [('-', '')]
TMM19000501-V12-05-page45.txt: [('MIS-', 'MIS')]
TMM19000501-V12-05-page5.txt: [('sol-', 'sol'), ('second-', 'second')]
TMM19000501-V12-05-page50.txt: [('-', ''), ('-', ''), ('-', ''), ('MIS-', 'MIS')]
TMM19000501-V12-05-page51.txt: [('A-', 'A'), ('God.-', 'God.'), ('be-', 'be'), ('be-', 'be'), ('-', ''), ('A-', 'A'), ('-', ''), ("'si-", "'si"), ('-', ''), ('A-', 'A'), ('-', ''), ('-', '')]
TMM19000501-V12-05-page6.txt: [('idola-', 'idola')]
TMM19000601-V12-06-page12.txt: [('-', '')]
TMM19000601-V12-06-page13.txt: [('-the', 'the')]
TMM19000601-V12-06-page19.txt: [('ap-', 'ap')]
TMM19000601-V12-06-page23.txt: [('surround-', 'surround')]
TMM19000601-V12-06-page27.txt: [('-', '')]
TMM19000601-V12-06-page28.txt: [('con-', 'con')]
TMM19000601-V12-06-page29.txt: [('amplifi-', 'amplifi'), ('-', '')]
TMM19000601-V12-06-page33.txt: [('the-', 'the')]
TMM19000601-V12-06-page34.txt: [('-', '')]
TMM19000601-V12-06-page37.txt: [('-', ''), ('devasta-', 'devasta')]
TMM19000601-V12-06-page38.txt: [('-acre', 'acre')]
TMM19000601-V12-06-page39.txt: [('-foot', 'foot')]
TMM19000601-V12-06-page44.txt: [('com-', 'com'), ('-', ''), ('-', '')]
TMM19000601-V12-06-page45.txt: [('-', ''), ('-', '')]
TMM19000601-V12-06-page48.txt: [('-', '')]
TMM19000601-V12-06-page49.txt: [('cur-', 'cur')]
TMM19000601-V12-06-page5.txt: [('prepara-', 'prepara')]
TMM19000601-V12-06-page50.txt: [('MIS-', 'MIS')]
TMM19000601-V12-06-page51.txt: [('A-', 'A'), ('A-', 'A'), ('be-', 'be'), ('-', '')]
TMM19000601-V12-06-page52.txt: [('-', '')]
TMM19000701-V12-07-page11.txt: [('reveren-', 'reveren')]
TMM19000701-V12-07-page13.txt: [('mem-', 'mem')]
TMM19000701-V12-07-page14.txt: [('-', '')]
TMM19000701-V12-07-page26.txt: [('-', ''), ('-', ''), ('-', '')]
TMM19000701-V12-07-page32.txt: [('con-', 'con')]
TMM19000701-V12-07-page34.txt: [('-', '')]
TMM19000701-V12-07-page36.txt: [('-', '')]
TMM19000701-V12-07-page44.txt: [('READING-', 'READING')]
TMM19000701-V12-07-page46.txt: [('-', ''), ('-', '')]
TMM19000701-V12-07-page47.txt: [('-', ''), ('-', '')]
TMM19000701-V12-07-page48.txt: [('-', '')]
TMM19000701-V12-07-page49.txt: [('-', '')]
TMM19000701-V12-07-page5.txt: [('-though', 'though')]
TMM19000701-V12-07-page50.txt: [('-', ''), ('-', ''), ('-', ''), ('MIS-', 'MIS')]
TMM19000701-V12-07-page51.txt: [('A-', 'A'), ('be-', 'be'), ('A-', 'A')]
TMM19000701-V12-07-page6.txt: [('congrega-', 'congrega')]
TMM19000701-V12-07-page9.txt: [('promul-', 'promul'), ('sub-', 'sub')]
TMM19000801-V12-08-page1.txt: [('-Vol.', 'Vol.')]
TMM19000801-V12-08-page10.txt: [('funda-', 'funda')]
TMM19000801-V12-08-page17.txt: [('-', ''), ('-', ''), ('-', '')]
TMM19000801-V12-08-page19.txt: [('road-', 'road'), ('be-', 'be')]
TMM19000801-V12-08-page2.txt: [('con-', 'con')]
TMM19000801-V12-08-page20.txt: [('-', ''), ('-', ''), ('-', '')]
TMM19000801-V12-08-page21.txt: [('appar-', 'appar')]
TMM19000801-V12-08-page23.txt: [('-', '')]
TMM19000801-V12-08-page26.txt: [('Astra-', 'Astra')]
TMM19000801-V12-08-page27.txt: [('Tscher-', 'Tscher')]
TMM19000801-V12-08-page28.txt: [('shin-', 'shin')]
TMM19000801-V12-08-page3.txt: [('-', '')]
TMM19000801-V12-08-page31.txt: [('--', '-')]
TMM19000801-V12-08-page33.txt: [('-', '')]
TMM19000801-V12-08-page38.txt: [('-', '')]
TMM19000801-V12-08-page39.txt: [('-', ''), ('-', '')]
TMM19000801-V12-08-page40.txt: [('v-', 'v')]
TMM19000801-V12-08-page44.txt: [('MIS-', 'MIS'), ('.-', '.'), ('.-', '.'), ('.-', '.'), ('-', ''), ('.-', '.'), ('.-', '.'), ('.-', '.'), ('.-', '.'), ('.-', '.'), ('.-', '.'), ('.-', '.'), ('.-', '.'), ('.-', '.')]
TMM19000801-V12-08-page45.txt: [('-', ''), ('.-', '.')]
TMM19000801-V12-08-page46.txt: [('-THE', 'THE')]
TMM19000801-V12-08-page47.txt: [('-', ''), ('-', '')]
TMM19000801-V12-08-page48.txt: [('pos-', 'pos')]
TMM19000801-V12-08-page49.txt: [('GA-', 'GA')]
TMM19000801-V12-08-page5.txt: [('View-', 'View'), ('Gos-', 'Gos')]
TMM19000801-V12-08-page50.txt: [('MtssioN-', 'MtssioN'), ('-', ''), ('-', ''), ('-', ''), ('MIS-', 'MIS')]
TMM19000801-V12-08-page7.txt: [('presi-', 'presi')]
TMM19000901-V12-09-page11.txt: [('-', '')]
TMM19000901-V12-09-page16.txt: [('an-', 'an'), ('cere-', 'cere'), ('con-', 'con')]
TMM19000901-V12-09-page24.txt: [('-to', 'to')]
TMM19000901-V12-09-page25.txt: [('-', ''), ('man-', 'man')]
TMM19000901-V12-09-page28.txt: [('di-', 'di')]
TMM19000901-V12-09-page29.txt: [('-return', 'return'), ('--', '-'), ('rep-', 'rep')]
TMM19000901-V12-09-page32.txt: [('-', '')]
TMM19000901-V12-09-page33.txt: [('-one', 'one'), ('adelan-', 'adelan')]
TMM19000901-V12-09-page42.txt: [('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19000901-V12-09-page43.txt: [('-', '')]
TMM19000901-V12-09-page44.txt: [('-SEPTEMBER', 'SEPTEMBER'), ('-', ''), ('-', '')]
TMM19000901-V12-09-page45.txt: [('-', ''), ('-', '')]
TMM19000901-V12-09-page46.txt: [('-twice', 'twice')]
TMM19000901-V12-09-page47.txt: [('Ho-', 'Ho')]
TMM19000901-V12-09-page48.txt: [('-and', 'and'), ('-charge', 'charge'), ('-connection', 'connection')]
TMM19000901-V12-09-page49.txt: [('-', ''), ('-', '')]
TMM19000901-V12-09-page50.txt: [('MIS-', 'MIS'), ('.-', '.')]
TMM19000901-V12-09-page51.txt: [('con-', 'con'), ('-', '')]
TMM19000901-V12-09-page52.txt: [('Blau-', 'Blau')]
TMM19001001-V12-10-page1.txt: [('-i', 'i'), ('--', '-'), ('-', ''), ('-', ''), ('-', ''), ('-.', '.'), ('-', ''), ('-', '')]
TMM19001001-V12-10-page14.txt: [('chil-', 'chil')]
TMM19001001-V12-10-page15.txt: [('--that', '-that')]
TMM19001001-V12-10-page18.txt: [('-', '')]
TMM19001001-V12-10-page2.txt: [('-', ''), ('-Australia', 'Australia')]
TMM19001001-V12-10-page23.txt: [('Pi-', 'Pi')]
TMM19001001-V12-10-page29.txt: [('-', ''), ('hold-', 'hold')]
TMM19001001-V12-10-page30.txt: [('scamper-', 'scamper')]
TMM19001001-V12-10-page44.txt: [('E-', 'E'), ('-', '')]
TMM19001001-V12-10-page45.txt: [('-', '')]
TMM19001001-V12-10-page46.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('---', '--'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19001001-V12-10-page47.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19001001-V12-10-page49.txt: [('-paid', 'paid')]
TMM19001001-V12-10-page50.txt: [('--August', '-August'), ('-', ''), ('-', ''), ('MIS-', 'MIS')]
TMM19001001-V12-10-page51.txt: [('-', '')]
TMM19001001-V12-10-page52.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-BEARING', 'BEARING'), ('utensil-', 'utensil')]
TMM19001001-V12-10-page6.txt: [('fellowship-', 'fellowship'), ('influ-', 'influ')]
TMM19001001-V12-10-page8.txt: [('MISSION-', 'MISSION'), ('MAG-', 'MAG'), ('condi-', 'condi')]
TMM19001101-V12-11-page1.txt: [('-', '')]
TMM19001101-V12-11-page13.txt: [('believ-', 'believ')]
TMM19001101-V12-11-page15.txt: [('cab-', 'cab')]
TMM19001101-V12-11-page18.txt: [('igno-', 'igno')]
TMM19001101-V12-11-page19.txt: [('se-', 'se'), ('rever-', 'rever')]
TMM19001101-V12-11-page20.txt: [('French-', 'French')]
TMM19001101-V12-11-page22.txt: [('-United', 'United')]
TMM19001101-V12-11-page24.txt: [('produc-', 'produc'), ('-', ''), ('ap-', 'ap')]
TMM19001101-V12-11-page27.txt: [('-', ''), ('-', '')]
TMM19001101-V12-11-page31.txt: [('-the', 'the')]
TMM19001101-V12-11-page32.txt: [('-', '')]
TMM19001101-V12-11-page34.txt: [('-', '')]
TMM19001101-V12-11-page38.txt: [('-NovEmnEn', 'NovEmnEn'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19001101-V12-11-page39.txt: [('-', ''), ('-Our', 'Our'), ('-', '')]
TMM19001101-V12-11-page40.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19001101-V12-11-page41.txt: [('-That', 'That'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19001101-V12-11-page42.txt: [('-DECEMBER', 'DECEMBER'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19001101-V12-11-page43.txt: [('-', '')]
TMM19001101-V12-11-page44.txt: [('-', ''), ('-', ''), ('MIS-', 'MIS'), ('.-', '.'), ('.-', '.'), ('.-', '.'), ('.-', '.'), ('.-', '.')]
TMM19001101-V12-11-page45.txt: [('.-', '.'), ('.-', '.'), ('.-', '.'), ('.-', '.'), ('.-', '.')]
TMM19001101-V12-11-page46.txt: [('-', ''), ('-', ''), ('.-', '.')]
TMM19001101-V12-11-page48.txt: [('-THE', 'THE')]
TMM19001101-V12-11-page49.txt: [('Sabbath-', 'Sabbath')]
TMM19001101-V12-11-page50.txt: [('MIS-', 'MIS')]
TMM19001101-V12-11-page51.txt: [('-', ''), ('makeCon-', 'makeCon'), ('-cured.', 'cured.'), ('.successful-', '.successful'), ('-', ''), ('-', ''), ('-name', 'name')]
TMM19001101-V12-11-page52.txt: [('-ought', 'ought'), ('-BEARING', 'BEARING'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ("-'not", "'not"), ('-With', 'With'), ('--', '-')]
TMM19001101-V12-11-page8.txt: [('-', '')]
TMM19001201-V12-12-page10.txt: [('ba-', 'ba')]
TMM19001201-V12-12-page14.txt: [('-', ''), ('-lb-', 'lb-'), ('-', ''), ('-', ''), ('-', '')]
TMM19001201-V12-12-page19.txt: [('-', '')]
TMM19001201-V12-12-page2.txt: [('-Oakland', 'Oakland')]
TMM19001201-V12-12-page23.txt: [('-', '')]
TMM19001201-V12-12-page3.txt: [('Spirit-', 'Spirit')]
TMM19001201-V12-12-page30.txt: [('-', ''), ('-', '')]
TMM19001201-V12-12-page31.txt: [('lo-', 'lo')]
TMM19001201-V12-12-page34.txt: [('blast--', 'blast-')]
TMM19001201-V12-12-page35.txt: [('-', ''), ('-', '')]
TMM19001201-V12-12-page37.txt: [('under-', 'under')]
TMM19001201-V12-12-page39.txt: [('-the', 'the')]
TMM19001201-V12-12-page42.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19001201-V12-12-page43.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19001201-V12-12-page44.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19001201-V12-12-page45.txt: [('-', ''), ('-', '')]
TMM19001201-V12-12-page47.txt: [('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19001201-V12-12-page48.txt: [('-Oun', 'Oun')]
TMM19001201-V12-12-page49.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19001201-V12-12-page50.txt: [('-', ''), ('-', ''), ('MIS-', 'MIS')]
TMM19001201-V12-12-page51.txt: [('-', '')]
TMM19001201-V12-12-page6.txt: [('an-', 'an'), ('Ad-', 'Ad'), ('-', '')]
TMM19001201-V12-12-page7.txt: [('advance-', 'advance')]
TMM19020101-V14-01-page1.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('--', '-'), ('-.New', '.New'), ('-', ''), ('-', ''), ('-The', 'The'), ('-', ''), ('-', ''), ('Alaska-', 'Alaska'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-.Mayaguez', '.Mayaguez'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('U-', 'U')]
TMM19020101-V14-01-page12.txt: [('Natal-', 'Natal')]
TMM19020101-V14-01-page13.txt: [('salva-', 'salva')]
TMM19020101-V14-01-page15.txt: [('re-', 're')]
TMM19020101-V14-01-page16.txt: [('-', ''), ('-', '')]
TMM19020101-V14-01-page17.txt: [('resur-', 'resur')]
TMM19020101-V14-01-page18.txt: [('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19020101-V14-01-page2.txt: [('LIFT-', 'LIFT'), ('-PRICE', 'PRICE'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('l-', 'l')]
TMM19020101-V14-01-page23.txt: [('impos-', 'impos')]
TMM19020101-V14-01-page27.txt: [('dis-', 'dis')]
TMM19020101-V14-01-page29.txt: [('-', '')]
TMM19020101-V14-01-page31.txt: [('-', ''), ('Pe-', 'Pe')]
TMM19020101-V14-01-page38.txt: [('heav-', 'heav')]
TMM19020101-V14-01-page49.txt: [('nec-', 'nec')]
TMM19020101-V14-01-page50.txt: [('-', ''), ('SECOND-', 'SECOND'), ('MAG-', 'MAG')]
TMM19020101-V14-01-page7.txt: [('grow-', 'grow')]
TMM19020101-V14-01-page8.txt: [('-', ''), ('-page', 'page')]
TMM19020201-V14-02-page1.txt: [('-ii', 'ii'), ('-', '')]
TMM19020201-V14-02-page12.txt: [('-', ''), ('Cama-', 'Cama')]
TMM19020201-V14-02-page14.txt: [('jus-', 'jus')]
TMM19020201-V14-02-page15.txt: [('-', '')]
TMM19020201-V14-02-page2.txt: [('-', ''), ('-', ''), ('-', '')]
TMM19020201-V14-02-page21.txt: [('MISSION-', 'MISSION')]
TMM19020201-V14-02-page25.txt: [('-eyes', 'eyes'), ('Ital-', 'Ital')]
TMM19020201-V14-02-page33.txt: [('neces-', 'neces')]
TMM19020201-V14-02-page35.txt: [('-', '')]
TMM19020201-V14-02-page38.txt: [('Erken-', 'Erken')]
TMM19020201-V14-02-page45.txt: [('-', '')]
TMM19020201-V14-02-page46.txt: [('Okla-', 'Okla')]
TMM19020201-V14-02-page47.txt: [('Fund.-', 'Fund.'), ('-', ''), ('Relief.-', 'Relief.'), ('Tithe.-', 'Tithe.'), ('Donations.-', 'Donations.'), ('Mission.-', 'Mission.'), ('Mission.-', 'Mission.'), ('Mission.-', 'Mission.'), ('Mission.-', 'Mission.'), ('Mission.-', 'Mission.'), ('Mission.-', 'Mission.'), ('Mission.-', 'Mission.'), ('Mission.-', 'Mission.'), ('Mission.-', 'Mission.'), ('School.-', 'School.'), ('Mission.-', 'Mission.'), ('Mission.-', 'Mission.'), ('Field.-', 'Field.'), ('Africa.-', 'Africa.'), ('Mission.-', 'Mission.')]
TMM19020201-V14-02-page49.txt: [('-', ''), ('ARIZONA.-', 'ARIZONA.'), ('CALIFORNIA.-', 'CALIFORNIA.'), ('-', ''), ('CUMBER-', 'CUMBER'), ('FLORIDA.-', 'FLORIDA.'), ('GEORGIA.-', 'GEORGIA.'), ('ILLINOIS.-', 'ILLINOIS.'), ('TERRITORY.-', 'TERRITORY.'), ('KANSAS.-', 'KANSAS.'), ('LOUISI-', 'LOUISI'), ('ANA.-', 'ANA.'), ('MIcHIGAN.-', 'MIcHIGAN.'), ('MINNESOTA.-', 'MINNESOTA.'), ('MISSOURI.-', 'MISSOURI.'), ('NE-', 'NE'), ('BRASKA.-', 'BRASKA.'), ('YORK.-', 'YORK.'), ('CAROLINA.-', 'CAROLINA.'), ('Oxio.-', 'Oxio.'), ('TERRITORY.-', 'TERRITORY.'), ('OREGON.-', 'OREGON.'), ('PENNSYLVANIA.-', 'PENNSYLVANIA.'), ('DAKOTA.-', 'DAKOTA.'), ('TEXAS.-', 'TEXAS.'), ('VERMONT.-', 'VERMONT.'), ('-', ''), ('VIRGINIA.-', 'VIRGINIA.'), ('g.-', 'g.')]
TMM19020201-V14-02-page5.txt: [('-', '')]
TMM19020201-V14-02-page50.txt: [('SECOND-', 'SECOND'), ('Expirations.-', 'Expirations.'), ('MAG-', 'MAG'), ('TEUTONIC-', 'TEUTONIC'), ('CELTIC-', 'CELTIC'), ('-', '')]
TMM19020201-V14-02-page51.txt: [('-', '')]
TMM19020201-V14-02-page52.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('RAIL-', 'RAIL')]
TMM19020201-V14-02-page7.txt: [('Holland-', 'Holland')]
TMM19020201-V14-02-page8.txt: [('-', ''), ('Advent-', 'Advent')]
TMM19020301-V14-03-page1.txt: [('--', '-'), ('-', ''), ('-The', 'The'), ('Hungary-', 'Hungary'), ('-', ''), ('-In', 'In'), ('L-', 'L'), ('-', ''), ('--', '-'), ('-', ''), ('.-', '.'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('Tuna-', 'Tuna'), ('-', ''), ("'Children-", "'Children")]
TMM19020301-V14-03-page10.txt: [('country-', 'country'), ('prin-', 'prin')]
TMM19020301-V14-03-page12.txt: [('com-', 'com')]
TMM19020301-V14-03-page14.txt: [('meet-', 'meet'), ('de-', 'de')]
TMM19020301-V14-03-page17.txt: [('Mon-', 'Mon')]
TMM19020301-V14-03-page2.txt: [('-', ''), ('-Apply', 'Apply'), ('WathiOR-', 'WathiOR'), ('-', ''), ('-', ''), ('.-', '.')]
TMM19020301-V14-03-page21.txt: [('increas-', 'increas')]
TMM19020301-V14-03-page22.txt: [('-', '')]
TMM19020301-V14-03-page26.txt: [('-', ''), ('appear-', 'appear'), ('-', ''), ('-miles', 'miles'), ('-', ''), ('Ar-', 'Ar')]
TMM19020301-V14-03-page27.txt: [('mem-', 'mem')]
TMM19020301-V14-03-page3.txt: [('RAIL-', 'RAIL'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19020301-V14-03-page31.txt: [('-', '')]
TMM19020301-V14-03-page35.txt: [('propor-', 'propor')]
TMM19020301-V14-03-page39.txt: [('IN-', 'IN')]
TMM19020301-V14-03-page40.txt: [('-A', 'A')]
TMM19020301-V14-03-page43.txt: [('-a', 'a')]
TMM19020301-V14-03-page45.txt: [('-gain', 'gain'), ('dark-', 'dark')]
TMM19020301-V14-03-page47.txt: [('-pressed', 'pressed')]
TMM19020301-V14-03-page49.txt: [('-', '')]
TMM19020301-V14-03-page5.txt: [('A-', 'A')]
TMM19020301-V14-03-page50.txt: [('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19020301-V14-03-page51.txt: [('Foun-', 'Foun'), ('-', ''), ('-inch', 'inch'), ('-inch', 'inch'), ('-in.', 'in.'), ('-in.', 'in.'), ('-', '')]
TMM19020301-V14-03-page52.txt: [('-', ''), ('-', '')]
TMM19020301-V14-03-page7.txt: [('---', '--'), ('-.', '.'), ('-i', 'i'), ('-.', '.'), ('-----', '----'), ('-', ''), ('.-', '.'), ('----', '---'), ('-', ''), ("--'---", "-'---"), ('-', ''), ('-', ''), ('-', ''), ('f--', 'f-')]
TMM19020301-V14-03-page8.txt: [('-', '')]
TMM19020301-V14-03-page9.txt: [('--or', '-or')]
TMM19020401-V14-04-page1.txt: [('-.', '.'), ('.CONTENTSib-', '.CONTENTSib'), ('-', ''), ('-', '')]
TMM19020401-V14-04-page11.txt: [('-strong', 'strong')]
TMM19020401-V14-04-page13.txt: [('-', '')]
TMM19020401-V14-04-page16.txt: [('p-a--', 'p-a-'), ('-arr', 'arr'), ('-', ''), ('-e', 'e'), ('-', ''), ('-', ''), ("--C'rA", "-C'rA"), ('-X', 'X')]
TMM19020401-V14-04-page2.txt: [('-', '')]
TMM19020401-V14-04-page24.txt: [('re-', 're')]
TMM19020401-V14-04-page3.txt: [('RAIL-', 'RAIL'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19020401-V14-04-page34.txt: [('-', ''), ('expe-', 'expe')]
TMM19020401-V14-04-page43.txt: [('institu-', 'institu')]
TMM19020401-V14-04-page46.txt: [('Brother-', 'Brother')]
TMM19020401-V14-04-page49.txt: [('accord-', 'accord')]
TMM19020401-V14-04-page50.txt: [('SECOND-', 'SECOND'), ('-', '')]
TMM19020401-V14-04-page52.txt: [('-.', '.')]
TMM19020401-V14-04-page7.txt: [('At-', 'At'), ('-rtitxm', 'rtitxm'), ('kk-t-', 'kk-t'), ('-', ''), ('.z-', '.z'), ('wt-', 'wt'), ('m-', 'm'), ('-', ''), ('t-', 't')]
TMM19020401-V14-04-page8.txt: [('--', '-')]
TMM19020501-V14-05-page1.txt: [('.-', '.'), ('-Jamaica', 'Jamaica'), ('-', ''), ('-', ''), ('-.', '.')]
TMM19020501-V14-05-page10.txt: [('sta-', 'sta')]
TMM19020501-V14-05-page11.txt: [('-', ''), ('suc-', 'suc')]
TMM19020501-V14-05-page13.txt: [('moun-', 'moun')]
TMM19020501-V14-05-page16.txt: [('-in', 'in')]
TMM19020501-V14-05-page17.txt: [('mis-', 'mis')]
TMM19020501-V14-05-page2.txt: [('-', ''), ('AS--', 'AS-'), ('-OUR', 'OUR'), ('-gives', 'gives')]
TMM19020501-V14-05-page24.txt: [('-first-day', 'first-day'), ('-', '')]
TMM19020501-V14-05-page25.txt: [('-', ''), ('-', '')]
TMM19020501-V14-05-page27.txt: [('out-', 'out')]
TMM19020501-V14-05-page3.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('RAIL-', 'RAIL')]
TMM19020501-V14-05-page31.txt: [('in-', 'in')]
TMM19020501-V14-05-page35.txt: [('Method-', 'Method')]
TMM19020501-V14-05-page36.txt: [('-', '')]
TMM19020501-V14-05-page4.txt: [('-', '')]
TMM19020501-V14-05-page40.txt: [('experi-', 'experi')]
TMM19020501-V14-05-page42.txt: [('-us', 'us')]
TMM19020501-V14-05-page43.txt: [('LATER.-', 'LATER.')]
TMM19020501-V14-05-page45.txt: [('ex-', 'ex')]
TMM19020501-V14-05-page46.txt: [('-', ''), ('Con-', 'Con')]
TMM19020501-V14-05-page47.txt: [('-', ''), ('Relief.-', 'Relief.'), ('Sanatorium.-', 'Sanatorium.'), ('-', ''), ('Tithe.-', 'Tithe.'), ('Conference.-', 'Conference.'), ('Mission.-', 'Mission.'), ('Conference.-', 'Conference.'), ('-', ''), ('-', ''), ('.-', '.'), ('-', ''), ('Mission.-', 'Mission.'), ('Italy.-', 'Italy.'), ('-', ''), ('-', ''), ('Mission.-', 'Mission.'), ('-', ''), ('Mission.-', 'Mission.'), ('-', ''), ('-California', 'California'), ('-', ''), ('Conference.-', 'Conference.'), ('Conference.-', 'Conference.'), ('-', ''), ('-Iowa', 'Iowa'), ('-', ''), ('-Minnesota', 'Minnesota')]
TMM19020501-V14-05-page48.txt: [('-', ''), ('-', ''), ('-', ''), ('Fund.-', 'Fund.'), ('-', ''), ('-', ''), ('BENEV-', 'BENEV'), ('-', '')]
TMM19020501-V14-05-page5.txt: [('I-', 'I')]
TMM19020501-V14-05-page50.txt: [('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19020501-V14-05-page51.txt: [('or.-', 'or.'), ('-wl', 'wl'), ('--', '-')]
TMM19020501-V14-05-page52.txt: [('-', ''), ('-', '')]
TMM19020501-V14-05-page6.txt: [('key-', 'key')]
TMM19020501-V14-05-page7.txt: [('MAGA-', 'MAGA')]
TMM19020501-V14-05-page8.txt: [('pro-', 'pro')]
In [30]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/TMM/correction3 Average verified rate: 0.9839672985814993 Average of error rates: 0.023232207792207794 Total token count: 861614
In [31]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[31]:
[("'", 583),
('e', 484),
('w', 476),
('m', 341),
('t', 326),
('r', 309),
('d', 302),
('n', 298),
('f', 269),
('g', 250),
('th', 109),
('x', 75),
('co', 70),
('k', 66),
('pa', 64),
('u', 64),
('z', 61),
('mis', 42),
('io', 42),
('oc', 40),
('oo', 33),
('cc', 29),
('sionary', 29),
('re', 25),
('al', 23),
("'the", 23),
('q', 22),
('mt', 20),
('hausaland', 19),
('id', 19),
("''", 19),
('stauffer', 19),
('ary', 19),
('basle', 18),
('zo', 18),
('ft', 18),
('mo', 18),
('couva', 17),
('kalaka', 17),
('hasegawa', 17),
('sul', 17),
('okohira', 16),
('ro', 16),
('pp', 15),
('helsingfors', 15),
('sabbathschool', 15),
("hours'", 15),
('te', 15),
('schwantes', 15),
('raiatea', 15)]
Correction 4 -- Remove Extra Quotation Marks¶
In [33]:
# %load shared_elements/remove_extra_quotation_marks.py
prev = cycle
cycle = "correction4"
directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
os.makedirs(directories['cycle'])
corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))
for filename in corpus:
content = utilities.readfile(directories['prev'], filename)
text = re.sub(r"[0-9,!?$:;&]", " ", content)
tokens = utilities.tokenize_text(text)
corrections = []
for token in tokens:
token_list = list(token)
last_char = token_list[-1]
if last_char is "'":
if len(token) > 1:
if token_list[-2] is 's' or 'S':
pass
else:
corrections.append((token, re.sub(r"'", r"", token)))
else:
pass
elif token[0] is "'":
corrections.append((token, re.sub(r"'", r"", token)))
else:
pass
if len(corrections) > 0:
print('{}: {}'.format(filename, corrections))
for correction in corrections:
content = clean.replace_pair(correction, content)
else:
pass
with open(join(directories['cycle'], filename), mode="w") as o:
o.write(content)
o.close()
TMM18980101-V10-01-page11.txt: [("'tis", 'tis')]
TMM18980101-V10-01-page12.txt: [("'his", 'his')]
TMM18980101-V10-01-page13.txt: [("'nough", 'nough')]
TMM18980101-V10-01-page16.txt: [("'The", 'The')]
TMM18980101-V10-01-page3.txt: [("'Redeemer", 'Redeemer')]
TMM18980101-V10-01-page4.txt: [("'YWVP", 'YWVP')]
TMM18980101-V10-01-page5.txt: [("'the", 'the'), ("'which", 'which'), ("'sphere", 'sphere')]
TMM18980201-V10-02-page14.txt: [("'my", 'my')]
TMM18980201-V10-02-page34.txt: [("'Contemplated", 'Contemplated')]
TMM18980301-V10-03-page6.txt: [("'t", 't')]
TMM18980401-V10-04-page12.txt: [("'put", 'put')]
TMM18980401-V10-04-page13.txt: [("'a", 'a')]
TMM18980401-V10-04-page17.txt: [("'The", 'The')]
TMM18980401-V10-04-page22.txt: [("'one", 'one'), ("'of", 'of')]
TMM18980401-V10-04-page31.txt: [("'s", 's'), ("'s", 's'), ("'s", 's'), ("'out", 'out')]
TMM18980401-V10-04-page4.txt: [("'S", 'S')]
TMM18980501-V10-05-page14.txt: [("'appreciate", 'appreciate')]
TMM18980501-V10-05-page28.txt: [("'g", 'g'), ("'f", 'f')]
TMM18980501-V10-05-page31.txt: [("'HYATT.", 'HYATT.')]
TMM18980501-V10-05-page40.txt: [("'God", 'God')]
TMM18980501-V10-05-page7.txt: [("'tis", 'tis')]
TMM18980601-V10-06-page20.txt: [("'.", '.'), ("'I", 'I'), ("'t", 't'), ("'..", '..')]
TMM18980601-V10-06-page25.txt: [("'Ye", 'Ye')]
TMM18980601-V10-06-page26.txt: [("'Here", 'Here')]
TMM18980601-V10-06-page32.txt: [("'countries.", 'countries.')]
TMM18980601-V10-06-page36.txt: [("'business", 'business')]
TMM18980601-V10-06-page5.txt: [("'forward", 'forward')]
TMM18980701-V10-07-page23.txt: [("'some", 'some')]
TMM18980701-V10-07-page25.txt: [("'the", 'the')]
TMM18980701-V10-07-page38.txt: [("'.", '.')]
TMM18980701-V10-07-page39.txt: [("'Signs", 'Signs')]
TMM18980701-V10-07-page40.txt: [("'Signs", 'Signs')]
TMM18980701-V10-07-page7.txt: [("'AlI", 'AlI')]
TMM18980801-V10-08-page11.txt: [("'them", 'them')]
TMM18980801-V10-08-page30.txt: [("'Come", 'Come')]
TMM18980801-V10-08-page31.txt: [("'is", 'is')]
TMM18980901-V10-09-page35.txt: [("'M.", 'M.')]
TMM18980901-V10-09-page7.txt: [("'Praise", 'Praise'), ("'Make", 'Make'), ("'I", 'I')]
TMM18981001-V10-10-page11.txt: [("'Reformation", 'Reformation'), ("'of", 'of')]
TMM18981001-V10-10-page13.txt: [("'at", 'at')]
TMM18981001-V10-10-page15.txt: [("'villas", 'villas')]
TMM18981001-V10-10-page21.txt: [("'Salvation", 'Salvation')]
TMM18981001-V10-10-page37.txt: [("'il", 'il'), ("'A", 'A')]
TMM18981001-V10-10-page6.txt: [("'eternal", 'eternal')]
TMM18981101-V10-11-page15.txt: [("'divine", 'divine')]
TMM18981101-V10-11-page17.txt: [("'Great", 'Great')]
TMM18981101-V10-11-page18.txt: [("'foreigner", 'foreigner'), ("'Corn", 'Corn'), ("'Great", 'Great')]
TMM18981101-V10-11-page22.txt: [("'native", 'native')]
TMM18981101-V10-11-page26.txt: [("'new", 'new')]
TMM18981101-V10-11-page32.txt: [('\'Creature."', 'Creature."')]
TMM18981101-V10-11-page37.txt: [("'.", '.')]
TMM18981101-V10-11-page5.txt: [("'authority", 'authority')]
TMM18981101-V10-11-page9.txt: [("'a", 'a')]
TMM18981201-V10-12-page10.txt: [("'no", 'no')]
TMM18981201-V10-12-page11.txt: [("'not", 'not')]
TMM18981201-V10-12-page31.txt: [("'he", 'he')]
TMM18981201-V10-12-page4.txt: [("'the", 'the')]
TMM18981201-V10-12-page41.txt: [("'tis", 'tis'), ("'twill", 'twill')]
TMM18981201-V10-12-page43.txt: [("'liberty", 'liberty')]
TMM18981201-V10-12-page7.txt: [("'Nile", 'Nile')]
TMM18990101-V11-01-page14.txt: [("'peons", 'peons'), ("'The'gold", 'Thegold'), ("'trees", 'trees')]
TMM18990101-V11-01-page20.txt: [("'our", 'our')]
TMM18990101-V11-01-page34.txt: [("'southeastern", 'southeastern')]
TMM18990101-V11-01-page39.txt: [("'If", 'If')]
TMM18990101-V11-01-page41.txt: [("'why", 'why')]
TMM18990101-V11-01-page47.txt: [("'Bible", 'Bible')]
TMM18990201-V11-02-page11.txt: [("'Battle", 'Battle')]
TMM18990201-V11-02-page31.txt: [("'Gather", 'Gather')]
TMM18990201-V11-02-page49.txt: [("'whom", 'whom')]
TMM18990201-V11-02-page51.txt: [("'or", 'or')]
TMM18990201-V11-02-page55.txt: [("'i", 'i'), ("'I", 'I')]
TMM18990301-V11-03-page14.txt: [("'the", 'the')]
TMM18990301-V11-03-page16.txt: [("'best", 'best')]
TMM18990301-V11-03-page26.txt: [("'He", 'He'), ("'is", 'is')]
TMM18990301-V11-03-page28.txt: [("'to", 'to')]
TMM18990301-V11-03-page37.txt: [("'acquainted", 'acquainted')]
TMM18990401-V11-04-page1.txt: [("'VOL.", 'VOL.')]
TMM18990401-V11-04-page3.txt: [("'hands", 'hands'), ("'Of", 'Of'), ("'Mercy", 'Mercy'), ("'drawn", 'drawn'), ("'culminate", 'culminate')]
TMM18990401-V11-04-page7.txt: [("'with", 'with')]
TMM18990501-V11-05-page35.txt: [("'cause", 'cause')]
TMM18990501-V11-05-page47.txt: [("'i", 'i')]
TMM18990601-V11-06-page10.txt: [("'very", 'very')]
TMM18990601-V11-06-page11.txt: [("'one", 'one'), ("'each", 'each')]
TMM18990601-V11-06-page12.txt: [("'produce", 'produce')]
TMM18990601-V11-06-page24.txt: [("'the", 'the')]
TMM18990601-V11-06-page29.txt: [("'northwest", 'northwest')]
TMM18990601-V11-06-page31.txt: [("'another", 'another')]
TMM18990701-V11-07-page11.txt: [("'Central", 'Central')]
TMM18990701-V11-07-page17.txt: [("'These", 'These')]
TMM18990701-V11-07-page2.txt: [("'The", 'The'), ("'God", 'God')]
TMM18990701-V11-07-page25.txt: [("'Church", 'Church')]
TMM18990701-V11-07-page33.txt: [("'Tis", 'Tis')]
TMM18990701-V11-07-page37.txt: [("'baptism", 'baptism')]
TMM18990701-V11-07-page39.txt: [("'Surely", 'Surely'), ("'Why", 'Why')]
TMM18990701-V11-07-page4.txt: [("'China", 'China'), ("'will", 'will')]
TMM18990701-V11-07-page40.txt: [("'Germany", 'Germany')]
TMM18990701-V11-07-page42.txt: [("'We", 'We')]
TMM18990701-V11-07-page46.txt: [("'row", 'row')]
TMM18990701-V11-07-page47.txt: [("''ettntIV", 'ettntIV')]
TMM18990801-V11-08-page11.txt: [("'R.", 'R.'), ("'.", '.'), ("'ft", 'ft'), ("'.", '.'), ("'.", '.'), ("'.", '.'), ("'C", 'C'), ("'RIM", 'RIM'), ("'iOnNA", 'iOnNA'), ("'Ct.", 'Ct.'), ("'cc", 'cc'), ("'..", '..'), ("'L.", 'L.'), ("'W", 'W'), ("'Mg", 'Mg'), ("'t", 't'), ("'.", '.'), ("'Co", 'Co'), ("'lgl", 'lgl'), ("'rt", 'rt'), ("'cCc", 'cCc')]
TMM18990801-V11-08-page37.txt: [("'i'Selected", 'iSelected')]
TMM18990901-V11-09-page1.txt: [("'ve", 've')]
TMM18990901-V11-09-page12.txt: [("'that", 'that')]
TMM18990901-V11-09-page25.txt: [("'Well", 'Well')]
TMM18990901-V11-09-page42.txt: [("'The", 'The')]
TMM18990901-V11-09-page43.txt: [("'Send", 'Send'), ("'comprehend", 'comprehend'), ("'Neglected", 'Neglected'), ("'Here", 'Here')]
TMM18991001-V11-10-page11.txt: [("'and", 'and'), ("'more", 'more')]
TMM18991001-V11-10-page22.txt: [("'to", 'to')]
TMM18991001-V11-10-page27.txt: [("'mission", 'mission')]
TMM18991001-V11-10-page30.txt: [("'licensed", 'licensed')]
TMM18991001-V11-10-page4.txt: [("'to", 'to'), ("'selected", 'selected')]
TMM18991001-V11-10-page41.txt: [("'giving", 'giving')]
TMM18991001-V11-10-page42.txt: [("'twelve", 'twelve')]
TMM18991001-V11-10-page43.txt: [("'How", 'How')]
TMM18991001-V11-10-page45.txt: [("'Stich", 'Stich')]
TMM18991101-V11-11-page23.txt: [("'...", '...'), ("'Z", 'Z')]
TMM18991101-V11-11-page24.txt: [("'Many", 'Many'), ("'going", 'going')]
TMM18991101-V11-11-page7.txt: [("'is", 'is')]
TMM18991201-V11-12-page24.txt: [("'filled", 'filled')]
TMM18991201-V11-12-page28.txt: [("'presidents", 'presidents')]
TMM18991201-V11-12-page31.txt: [("'Whosoever", 'Whosoever')]
TMM18991201-V11-12-page40.txt: [("'summer's", 'summers')]
TMM18991201-V11-12-page45.txt: [("'benefited", 'benefited')]
TMM19000101-V12-01-page1.txt: [("'GREECE", 'GREECE')]
TMM19000101-V12-01-page10.txt: [("'qie", 'qie'), ("'o", 'o')]
TMM19000101-V12-01-page29.txt: [("'blue", 'blue')]
TMM19000101-V12-01-page31.txt: [("'climbing", 'climbing')]
TMM19000101-V12-01-page32.txt: [("'twere", 'twere')]
TMM19000101-V12-01-page34.txt: [("'We", 'We')]
TMM19000101-V12-01-page35.txt: [("'I", 'I')]
TMM19000101-V12-01-page46.txt: [("'make", 'make')]
TMM19000101-V12-01-page48.txt: [("'Thou", 'Thou'), ("'No", 'No')]
TMM19000101-V12-01-page52.txt: [("'BILLS", 'BILLS'), ("'TRANSFER", 'TRANSFER')]
TMM19000201-V12-02-page1.txt: [("'THE", 'THE')]
TMM19000201-V12-02-page30.txt: [("'Shall", 'Shall')]
TMM19000201-V12-02-page39.txt: [("'the", 'the')]
TMM19000201-V12-02-page5.txt: [('\'"and', '"and')]
TMM19000201-V12-02-page51.txt: [("'Milk", 'Milk'), ("'COTTItEri", 'COTTItEri'), ("'BrOOkbtli", 'BrOOkbtli'), ("'clean", 'clean')]
TMM19000301-V12-03-page17.txt: [("'Go", 'Go')]
TMM19000301-V12-03-page2.txt: [("'God", 'God')]
TMM19000301-V12-03-page31.txt: [("'Well", 'Well')]
TMM19000301-V12-03-page34.txt: [("'not", 'not'), ("'powers", 'powers')]
TMM19000301-V12-03-page36.txt: [("'Here", 'Here')]
TMM19000301-V12-03-page47.txt: [("'upon", 'upon'), ("'Come", 'Come')]
TMM19000301-V12-03-page48.txt: [("'well", 'well')]
TMM19000301-V12-03-page5.txt: [("'the", 'the')]
TMM19000301-V12-03-page9.txt: [("''j", 'j')]
TMM19000401-V12-04-page15.txt: [("'I", 'I')]
TMM19000401-V12-04-page16.txt: [("'centuries", 'centuries')]
TMM19000401-V12-04-page29.txt: [("'suppose", 'suppose')]
TMM19000401-V12-04-page33.txt: [("'Christ", 'Christ')]
TMM19000401-V12-04-page51.txt: [("'s", 's'), ("'our", 'our'), ("'Come", 'Come')]
TMM19000401-V12-04-page52.txt: [("'Tis", 'Tis')]
TMM19000501-V12-05-page1.txt: [("'CIRCLE", 'CIRCLE')]
TMM19000501-V12-05-page2.txt: [("'lasso", 'lasso')]
TMM19000501-V12-05-page32.txt: [("'Lord", 'Lord')]
TMM19000501-V12-05-page39.txt: [("'time", 'time')]
TMM19000501-V12-05-page42.txt: [("'Years", 'Years')]
TMM19000501-V12-05-page51.txt: [("'lath", 'lath'), ("'that", 'that'), ("'si", 'si'), ("'The", 'The'), ("'because", 'because')]
TMM19000501-V12-05-page52.txt: [("'Tis", 'Tis')]
TMM19000501-V12-05-page6.txt: [("'Tis", 'Tis')]
TMM19000601-V12-06-page1.txt: [("'A", 'A'), ("'Nassau", 'Nassau')]
TMM19000601-V12-06-page11.txt: [("'Come", 'Come')]
TMM19000601-V12-06-page14.txt: [("'little", 'little')]
TMM19000601-V12-06-page52.txt: [("'Tis", 'Tis')]
TMM19000701-V12-07-page10.txt: [("'The", 'The')]
TMM19000701-V12-07-page12.txt: [("'If", 'If'), ("'Ever", 'Ever')]
TMM19000701-V12-07-page13.txt: [("'of", 'of')]
TMM19000701-V12-07-page46.txt: [("'Well", 'Well')]
TMM19000701-V12-07-page5.txt: [("'twas", 'twas')]
TMM19000701-V12-07-page52.txt: [("'Tis", 'Tis')]
TMM19000701-V12-07-page7.txt: [("'ping", 'ping')]
TMM19000801-V12-08-page11.txt: [("'to", 'to')]
TMM19000801-V12-08-page13.txt: [("'Mohammedan", 'Mohammedan')]
TMM19000801-V12-08-page2.txt: [("'mw", 'mw')]
TMM19000801-V12-08-page22.txt: [("'The", 'The')]
TMM19000801-V12-08-page23.txt: [("'Sufficient", 'Sufficient')]
TMM19000801-V12-08-page24.txt: [("'Have", 'Have')]
TMM19000801-V12-08-page29.txt: [("'luminated", 'luminated')]
TMM19000801-V12-08-page36.txt: [("'In", 'In')]
TMM19000801-V12-08-page41.txt: [("'Casting", 'Casting'), ("'Cast", 'Cast'), ("'Hast", 'Hast'), ("'Commit", 'Commit'), ("'are", 'are')]
TMM19000801-V12-08-page42.txt: [("'Underneath", 'Underneath')]
TMM19000801-V12-08-page47.txt: [("'United", 'United')]
TMM19000801-V12-08-page52.txt: [("'Tis", 'Tis'), ("'enA", 'enA')]
TMM19000801-V12-08-page6.txt: [("'Behold", 'Behold')]
TMM19000901-V12-09-page12.txt: [("'For", 'For'), ("'The", 'The'), ("'for", 'for'), ("'For", 'For'), ("'The", 'The'), ("'For", 'For'), ("'The", 'The'), ("'Though", 'Though')]
TMM19000901-V12-09-page16.txt: [("'hall", 'hall')]
TMM19000901-V12-09-page43.txt: [("'These", 'These')]
TMM19000901-V12-09-page45.txt: [("'T", 'T')]
TMM19000901-V12-09-page49.txt: [("'is", 'is')]
TMM19000901-V12-09-page52.txt: [("'for", 'for'), ("'Tis", 'Tis')]
TMM19000901-V12-09-page6.txt: [("'send", 'send'), ("'missionary", 'missionary')]
TMM19000901-V12-09-page7.txt: [("'missionary", 'missionary')]
TMM19001001-V12-10-page1.txt: [("'PERFECT", 'PERFECT'), ("'CHINA", 'CHINA'), ("'INDEPENDENCE", 'INDEPENDENCE'), ("'I.", 'I.'), ("'LETTERS", 'LETTERS'), ("'PUBLISHED", 'PUBLISHED')]
TMM19001001-V12-10-page16.txt: [("'When", 'When')]
TMM19001001-V12-10-page18.txt: [("'Pearly", 'Pearly')]
TMM19001001-V12-10-page19.txt: [("'way", 'way'), ("'Old", 'Old'), ("'All", 'All'), ("'Reason", 'Reason')]
TMM19001001-V12-10-page49.txt: [("'The", 'The'), ("'are", 'are'), ("'knows", 'knows')]
TMM19001001-V12-10-page51.txt: [("'York.", 'York.')]
TMM19001001-V12-10-page52.txt: [("'Tie", 'Tie'), ("'Latest", 'Latest')]
TMM19001101-V12-11-page16.txt: [("'the", 'the')]
TMM19001101-V12-11-page18.txt: [("'professional", 'professional')]
TMM19001101-V12-11-page47.txt: [("'Why", 'Why')]
TMM19001101-V12-11-page51.txt: [("'roof", 'roof'), ("'Brooklyn", 'Brooklyn'), ("'without", 'without'), ("'for", 'for'), ("'under", 'under'), ("'Venial..", 'Venial..')]
TMM19001101-V12-11-page52.txt: [("'Seventh", 'Seventh'), ("'fine", 'fine'), ("'not", 'not')]
TMM19001101-V12-11-page6.txt: [("'he", 'he'), ("'Cast", 'Cast'), ("'Cast", 'Cast')]
TMM19001201-V12-12-page14.txt: [("'la", 'la')]
TMM19001201-V12-12-page2.txt: [("'UNDERWOOD", 'UNDERWOOD')]
TMM19001201-V12-12-page22.txt: [("'house", 'house')]
TMM19001201-V12-12-page28.txt: [("'events.", 'events.')]
TMM19001201-V12-12-page3.txt: [("'i", 'i')]
TMM19001201-V12-12-page49.txt: [("'Volunteer", 'Volunteer')]
TMM19001201-V12-12-page51.txt: [("'York.", 'York.')]
TMM19001201-V12-12-page52.txt: [("'Tie", 'Tie')]
TMM19001201-V12-12-page9.txt: [("'as", 'as')]
TMM19020101-V14-01-page10.txt: [("'onechapter", 'onechapter')]
TMM19020101-V14-01-page13.txt: [("'neath", 'neath'), ("'Tis", 'Tis'), ("'Tis", 'Tis')]
TMM19020101-V14-01-page2.txt: [("'N", 'N')]
TMM19020101-V14-01-page34.txt: [("'disease.", 'disease.')]
TMM19020201-V14-02-page37.txt: [('\'"', '"')]
TMM19020201-V14-02-page40.txt: [("'verse", 'verse')]
TMM19020201-V14-02-page42.txt: [("'at", 'at')]
TMM19020201-V14-02-page49.txt: [("'Jo", 'Jo')]
TMM19020201-V14-02-page50.txt: [("'act", 'act'), ("'For", 'For')]
TMM19020201-V14-02-page51.txt: [("'stigmatized", 'stigmatized')]
TMM19020201-V14-02-page52.txt: [("'Scenery", 'Scenery')]
TMM19020301-V14-03-page1.txt: [("'The", 'The'), ("'illustrated", 'illustrated'), ("'Japan", 'Japan'), ("'Among", 'Among'), ("'.", '.'), ("'MISSION", 'MISSION'), ("'.", '.'), ("'The", 'The'), ("'.", '.'), ("'Children", 'Children')]
TMM19020301-V14-03-page10.txt: [("'will", 'will')]
TMM19020301-V14-03-page2.txt: [("'ckroutes", 'ckroutes'), ("'.", '.'), ("'it", 'it'), ("'ST", 'ST'), ("'CIMAGO", 'CIMAGO'), ("'State", 'State')]
TMM19020301-V14-03-page25.txt: [("'quarantine", 'quarantine')]
TMM19020301-V14-03-page26.txt: [("'distant.", 'distant.')]
TMM19020301-V14-03-page40.txt: [("'has", 'has')]
TMM19020301-V14-03-page42.txt: [("'diet", 'diet')]
TMM19020301-V14-03-page44.txt: [("'and", 'and')]
TMM19020301-V14-03-page8.txt: [("'patient.", 'patient.')]
TMM19020401-V14-04-page12.txt: [("'effigy", 'effigy')]
TMM19020401-V14-04-page15.txt: [("'Tis", 'Tis')]
TMM19020401-V14-04-page19.txt: [("'the", 'the')]
TMM19020401-V14-04-page39.txt: [("'Therefore", 'Therefore')]
TMM19020401-V14-04-page46.txt: [("'down", 'down')]
TMM19020401-V14-04-page51.txt: [("'Tremont", 'Tremont')]
TMM19020401-V14-04-page52.txt: [("'OW", 'OW')]
TMM19020501-V14-05-page14.txt: [("'had", 'had')]
TMM19020501-V14-05-page2.txt: [("'CABINET", 'CABINET')]
TMM19020501-V14-05-page24.txt: [("'from", 'from')]
TMM19020501-V14-05-page31.txt: [("'But", 'But')]
TMM19020501-V14-05-page32.txt: [("'only", 'only')]
TMM19020501-V14-05-page37.txt: [("'come", 'come')]
TMM19020501-V14-05-page51.txt: [("'.", '.'), ("'....", '....'), ("'details", 'details')]
TMM19020501-V14-05-page6.txt: [("'love.", 'love.')]
In [36]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/TMM/correction4 Average verified rate: 0.9845119517865869 Average of error rates: 0.022646753246753245 Total token count: 861503
In [37]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[37]:
[('e', 485),
('w', 476),
("'", 466),
('t', 344),
('m', 343),
('r', 313),
('d', 303),
('n', 302),
('f', 271),
('g', 254),
('th', 110),
('x', 76),
('co', 70),
('k', 68),
('pa', 64),
('u', 64),
('z', 63),
('mis', 42),
('io', 42),
('oc', 40),
('oo', 33),
('cc', 31),
('sionary', 29),
('re', 25),
('al', 23),
('q', 22),
('mt', 20),
('hausaland', 19),
('id', 19),
('ft', 19),
('stauffer', 19),
('ary', 19),
('basle', 18),
('zo', 18),
('mo', 18),
('couva', 17),
('kalaka', 17),
('hasegawa', 17),
('sul', 17),
('okohira', 16),
('ro', 16),
('pp', 15),
('helsingfors', 15),
('sabbathschool', 15),
("hours'", 15),
('te', 15),
('schwantes', 15),
('raiatea', 15),
('wm', 15),
('ioo', 14)]
Correction 5 -- Rejoin Split Words¶
In [39]:
# %load shared_elements/rejoin_split_words.py
prev = cycle
cycle = "correction5"
directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
os.makedirs(directories['cycle'])
corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))
for filename in corpus:
content = utilities.readfile(directories['prev'], filename)
text = re.sub(r"[0-9,!?$:;&]", " ", content)
tokens = utilities.tokenize_text(text)
errors = reports.identify_errors(tokens, spelling_dictionary)
replacements = clean.check_if_stem(errors, spelling_dictionary, tokens, get_prior=False)
if len(replacements) > 0:
print('{}: {}'.format(filename, replacements))
for replacement in replacements:
content = clean.replace_split_words(replacement, content)
else:
pass
with open(join(directories['cycle'], filename), mode="w") as o:
o.write(content)
o.close()
TMM18980101-V10-01-page1.txt: [('Mis', 'SIONARY')]
TMM18980101-V10-01-page31.txt: [('ro', 'of')]
TMM18980101-V10-01-page4.txt: [('fil', 'A')]
TMM18980201-V10-02-page37.txt: [('MIS', 'SION'), ('QUAR', 'TER')]
TMM18980201-V10-02-page38.txt: [('zo', 'o')]
TMM18980201-V10-02-page39.txt: [('SECRE', 'TARIES')]
TMM18980201-V10-02-page9.txt: [('Ning', 'po')]
TMM18980301-V10-03-page24.txt: [('Mis', 'SIONARY')]
TMM18980301-V10-03-page25.txt: [('es', 'd'), ('mi', 'o'), ('por', 'no'), ('Aqui', 'no')]
TMM18980301-V10-03-page39.txt: [('Mis', 'SION')]
TMM18980301-V10-03-page6.txt: [('Fi', 'le')]
TMM18980301-V10-03-page8.txt: [('Mc', 'Carthy')]
TMM18980401-V10-04-page26.txt: [('HISTOR', 'ICAL')]
TMM18980401-V10-04-page3.txt: [('G.', '')]
TMM18980401-V10-04-page36.txt: [('pais', 'a')]
TMM18980401-V10-04-page7.txt: [("KING'", 'S')]
TMM18980501-V10-05-page33.txt: [("God'", 's')]
TMM18980501-V10-05-page35.txt: [('MIS', 'SION'), ('QUAR', 'TER')]
TMM18980501-V10-05-page39.txt: [('re', 'leased')]
TMM18980601-V10-06-page16.txt: [('sr', 'A')]
TMM18980801-V10-08-page35.txt: [('MIS', 'SION'), ('QUAR', 'TER')]
TMM18980801-V10-08-page38.txt: [('MIS', 'SIONARY')]
TMM18980901-V10-09-page17.txt: [('wh', 'en')]
TMM18981001-V10-10-page28.txt: [('Previo', 'us')]
TMM18981101-V10-11-page17.txt: [('MIS', 'SIONARY')]
TMM18981101-V10-11-page20.txt: [('Tien', 'Tsin')]
TMM18981101-V10-11-page25.txt: [('MAGA', 'ZINE')]
TMM18981101-V10-11-page36.txt: [('SOCI', 'ETY')]
TMM18981101-V10-11-page37.txt: [('MAGA', 'ZINE')]
TMM18981101-V10-11-page38.txt: [('MAGA', 'ZINE')]
TMM18981201-V10-12-page41.txt: [('wa', 'y'), ('MC', 'CARTHY')]
TMM18990101-V11-01-page14.txt: [('es', 'Pecially')]
TMM18990101-V11-01-page29.txt: [('MIS', 'SIONARY')]
TMM18990101-V11-01-page47.txt: [('PHILADEL', 'PHIA')]
TMM18990101-V11-01-page48.txt: [('repre', 'sentative')]
TMM18990201-V11-02-page14.txt: [("Angel'", 's')]
TMM18990201-V11-02-page52.txt: [('MIS', 'SION'), ('QUAR', 'TER')]
TMM18990201-V11-02-page54.txt: [('PHILADEL', 'PHIA')]
TMM18990301-V11-03-page11.txt: [('tri', 'weekly')]
TMM18990301-V11-03-page34.txt: [('G.', '')]
TMM18990301-V11-03-page38.txt: [('PHILADEL', 'PHIA')]
TMM18990401-V11-04-page36.txt: [('th', 'or')]
TMM18990401-V11-04-page38.txt: [('PHILADEL', 'PHIA'), ('Indo', 'China')]
TMM18990401-V11-04-page39.txt: [('repre', 'sentative')]
TMM18990501-V11-05-page42.txt: [('MIS', 'SION'), ('QUAR', 'TER')]
TMM18990501-V11-05-page46.txt: [('PHILADEL', 'PHIA')]
TMM18990601-V11-06-page46.txt: [('PHILADEL', 'PHIA')]
TMM18990701-V11-07-page26.txt: [('civiliz', 'ation')]
TMM18990701-V11-07-page27.txt: [('so-', 'called')]
TMM18990701-V11-07-page41.txt: [('longsuffer', 'ing')]
TMM18990701-V11-07-page46.txt: [('MIS', 'SIONARY')]
TMM18990801-V11-08-page11.txt: [('PC', 't'), ('al', 'i'), ('CA', 'W'), ('RI', 'a'), ('TE', 'R'), ('ato', 'N'), ('re', 'C'), ('ma', 'I'), ('te', 'a'), ('tAl', 'a'), ('JU', 'N'), ('Ele', 'a'), ('EV', 'I'), ('CI', 'T')]
TMM18990801-V11-08-page22.txt: [('IL', 'A')]
TMM18990801-V11-08-page34.txt: [('Philadel', 'phia')]
TMM18990801-V11-08-page45.txt: [('MIS', 'SION')]
TMM18990801-V11-08-page46.txt: [('MIS', 'SIONARY')]
TMM18990901-V11-09-page34.txt: [('co', 'laborer')]
TMM18990901-V11-09-page35.txt: [('mis', 'sionary')]
TMM18990901-V11-09-page46.txt: [('MIS', 'SIONARY')]
TMM18991001-V11-10-page17.txt: [('Mc', 'Carthy')]
TMM18991001-V11-10-page46.txt: [('MIS', 'SIONARY')]
TMM18991101-V11-11-page37.txt: [('MAGA', 'ZINE')]
TMM18991101-V11-11-page42.txt: [('MIS', 'SION')]
TMM18991101-V11-11-page44.txt: [('Indo', 'China')]
TMM18991101-V11-11-page46.txt: [('MIS', 'SIONARY'), ('Superin', 'tendent')]
TMM18991201-V11-12-page21.txt: [('MC', 'CARTHY')]
TMM18991201-V11-12-page31.txt: [('th', 'at')]
TMM18991201-V11-12-page38.txt: [('RE', 'V')]
TMM18991201-V11-12-page40.txt: [('Nebuchadn', "ezzar's")]
TMM18991201-V11-12-page46.txt: [('MIS', 'SIONARY')]
TMM19000101-V12-01-page10.txt: [('re', 'a')]
TMM19000101-V12-01-page37.txt: [('MAGA', 'ZINE')]
TMM19000101-V12-01-page44.txt: [('MIS', 'SIONARY')]
TMM19000101-V12-01-page47.txt: [('MAGA', 'ZINE')]
TMM19000101-V12-01-page50.txt: [('MIS', 'SIONARY'), ('MAGA', 'ZINE')]
TMM19000101-V12-01-page51.txt: [('reci', 'pes')]
TMM19000101-V12-01-page6.txt: [('un', 'INTELLIGENT')]
TMM19000201-V12-02-page36.txt: [('corre', 'sponding')]
TMM19000201-V12-02-page40.txt: [('Indo', 'China'), ('EM', 'It')]
TMM19000201-V12-02-page46.txt: [('MIS', 'SION')]
TMM19000201-V12-02-page50.txt: [('MIS', 'SIONARY')]
TMM19000201-V12-02-page51.txt: [('TRE', 'S')]
TMM19000201-V12-02-page52.txt: [('ig', 'n')]
TMM19000301-V12-03-page39.txt: [('pre', 'arrangement')]
TMM19000301-V12-03-page46.txt: [('ti', 'to')]
TMM19000401-V12-04-page50.txt: [('MIS', 'SIONARY')]
TMM19000401-V12-04-page51.txt: [('WA', 'RDS')]
TMM19000501-V12-05-page18.txt: [('CA', 'VINESS')]
TMM19000501-V12-05-page22.txt: [('re', 'no')]
TMM19000501-V12-05-page45.txt: [('MIS', 'SION')]
TMM19000501-V12-05-page50.txt: [('MIS', 'SIONARY')]
TMM19000501-V12-05-page52.txt: [('Li', 'Q')]
TMM19000601-V12-06-page21.txt: [('Ju', 'n')]
TMM19000601-V12-06-page45.txt: [('RE', 'VIEW')]
TMM19000601-V12-06-page49.txt: [('infor', 'mation')]
TMM19000601-V12-06-page50.txt: [('MIS', 'SIONARY')]
TMM19000701-V12-07-page40.txt: [('exac', 'test')]
TMM19000701-V12-07-page50.txt: [('MIS', 'SIONARY'), ('regula', 'rly')]
TMM19000701-V12-07-page52.txt: [('SIMPLICIT', 'Y')]
TMM19000801-V12-08-page12.txt: [('wh', 'o')]
TMM19000801-V12-08-page44.txt: [('MIS', 'SION')]
TMM19000801-V12-08-page45.txt: [('MA', 'TABELE')]
TMM19000801-V12-08-page47.txt: [('Tien', 'Tsin')]
TMM19000801-V12-08-page50.txt: [('MIS', 'SIONARY')]
TMM19000801-V12-08-page51.txt: [('VESTIBU', 'LED')]
TMM19000801-V12-08-page52.txt: [('SIMPLIC', 'ITY')]
TMM19000901-V12-09-page32.txt: [('MC', 'CARTHY')]
TMM19000901-V12-09-page5.txt: [('MC', 'CARTHY')]
TMM19000901-V12-09-page50.txt: [('MIS', 'SIONARY')]
TMM19001001-V12-10-page20.txt: [('MC', 'CARTHY')]
TMM19001001-V12-10-page38.txt: [('studen', 'ts')]
TMM19001001-V12-10-page44.txt: [('re', 'ct')]
TMM19001001-V12-10-page50.txt: [('MIS', 'SIONARY')]
TMM19001101-V12-11-page28.txt: [('MC', 'CARTHY')]
TMM19001101-V12-11-page44.txt: [('MIS', 'SION')]
TMM19001101-V12-11-page45.txt: [('RARATONG', 'A')]
TMM19001101-V12-11-page47.txt: [('fel', 'lows')]
TMM19001101-V12-11-page50.txt: [('MIS', 'SIONARY')]
TMM19001101-V12-11-page51.txt: [('BEW', 'ARE'), ('re', 'Price')]
TMM19001201-V12-12-page11.txt: [('magnif', 'icent')]
TMM19001201-V12-12-page18.txt: [('MC', 'CARTHY')]
TMM19001201-V12-12-page2.txt: [('co', 'mpany')]
TMM19001201-V12-12-page31.txt: [('wa', 's')]
TMM19001201-V12-12-page35.txt: [('re', 'reading')]
TMM19001201-V12-12-page47.txt: [('Ro', 'man')]
TMM19001201-V12-12-page50.txt: [('MIS', 'SIONARY')]
TMM19020101-V14-01-page43.txt: [('Guadalaj', 'ara')]
TMM19020201-V14-02-page17.txt: [('unf', 'allen')]
TMM19020201-V14-02-page33.txt: [('th', 'or')]
TMM19020201-V14-02-page48.txt: [('ASSO', 'CIATION')]
TMM19020201-V14-02-page49.txt: [('LOUISI', 'ANA'), ('NE', 'BRASKA')]
TMM19020201-V14-02-page8.txt: [('Tien', 'Tsin')]
TMM19020301-V14-03-page1.txt: [('EDITORI', 'AL')]
TMM19020301-V14-03-page16.txt: [('MC', 'CARTHY')]
TMM19020301-V14-03-page26.txt: [('th', 'o')]
TMM19020401-V14-04-page16.txt: [('pA', 'L')]
TMM19020401-V14-04-page47.txt: [('Sul', 'a')]
TMM19020401-V14-04-page52.txt: [('ma', 'M')]
TMM19020501-V14-05-page1.txt: [('TH', 'E')]
TMM19020501-V14-05-page11.txt: [('suc', 'cess')]
TMM19020501-V14-05-page2.txt: [('GA', 'g')]
TMM19020501-V14-05-page48.txt: [('oc', 'H'), ('BENEV', 'OLENT')]
TMM19020501-V14-05-page7.txt: [('MAGA', 'ZINE')]
In [42]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/TMM/correction5 Average verified rate: 0.984760022894214 Average of error rates: 0.022267532467532464 Total token count: 861353
In [43]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[43]:
[('e', 484),
('w', 476),
("'", 466),
('m', 342),
('t', 342),
('r', 312),
('d', 302),
('n', 300),
('f', 271),
('g', 254),
('th', 107),
('x', 76),
('co', 69),
('k', 68),
('u', 64),
('pa', 63),
('z', 63),
('io', 42),
('oc', 40),
('oo', 33),
('cc', 31),
('al', 21),
('q', 21),
('mt', 20),
('re', 20),
('hausaland', 19),
('id', 19),
('ary', 19),
('ft', 19),
('stauffer', 19),
('zo', 18),
('basle', 18),
('mo', 18),
('couva', 17),
('kalaka', 17),
('hasegawa', 17),
('sul', 17),
('okohira', 16),
('helsingfors', 15),
('pp', 15),
('sabbathschool', 15),
("hours'", 15),
('schwantes', 15),
('raiatea', 15),
('wm', 15),
('ro', 15),
('ioo', 14),
('seventhday', 14),
('ic', 14),
('te', 14)]
Correction 6 -- Rejoin Split Words II¶
In [45]:
# %load shared_elements/rejoin_split_words.py
prev = cycle
cycle = "correction6"
directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
os.makedirs(directories['cycle'])
corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))
for filename in corpus:
content = utilities.readfile(directories['prev'], filename)
text = re.sub(r"[0-9,!?$:;&]", " ", content)
tokens = utilities.tokenize_text(text)
errors = reports.identify_errors(tokens, spelling_dictionary)
replacements = clean.check_if_stem(errors, spelling_dictionary, tokens, get_prior=True)
if len(replacements) > 0:
print('{}: {}'.format(filename, replacements))
for replacement in replacements:
content = clean.replace_split_words(replacement, content)
else:
pass
with open(join(directories['cycle'], filename), mode="w") as o:
o.write(content)
o.close()
TMM18980201-V10-02-page37.txt: [('V', 'ermont')]
TMM18980301-V10-03-page25.txt: [('No', 'se')]
TMM18980301-V10-03-page39.txt: [('MisSION', 'ARY')]
TMM18980501-V10-05-page33.txt: [('whole', 'heartedness')]
TMM18980601-V10-06-page16.txt: [('r', 'OW'), ('the', 'Re')]
TMM18980901-V10-09-page5.txt: [('sub', 'terranean')]
TMM18981001-V10-10-page28.txt: [('w', 'ork')]
TMM18981201-V10-12-page42.txt: [('u', 'tA')]
TMM18981201-V10-12-page45.txt: [('Character', 'istic')]
TMM18990101-V11-01-page14.txt: [('es', 'Pecially')]
TMM18990101-V11-01-page47.txt: [('MISSION', 'ARY')]
TMM18990101-V11-01-page48.txt: [('repre', 'sentative')]
TMM18990201-V11-02-page45.txt: [('o', 'ffer')]
TMM18990201-V11-02-page54.txt: [('MISSION', 'ARY')]
TMM18990301-V11-03-page34.txt: [('Num', 'ber')]
TMM18990301-V11-03-page38.txt: [('MISSION', 'ARY')]
TMM18990301-V11-03-page9.txt: [('r', 'oth')]
TMM18990401-V11-04-page38.txt: [('MISSION', 'ARY')]
TMM18990401-V11-04-page39.txt: [('repre', 'sentative')]
TMM18990501-V11-05-page22.txt: [('req', 'uirement')]
TMM18990501-V11-05-page26.txt: [('in', 'ti')]
TMM18990501-V11-05-page46.txt: [('MISSION', 'ARY')]
TMM18990601-V11-06-page46.txt: [('MISSION', 'ARY')]
TMM18990701-V11-07-page17.txt: [('a', 'ny')]
TMM18990701-V11-07-page5.txt: [('gov', "ernor's")]
TMM18990801-V11-08-page11.txt: [('a', 'tt'), ('t', 'il'), ('s', 'gt'), ('at', 'co'), ('a', 'te'), ('a', 'RIZ')]
TMM18990801-V11-08-page20.txt: [('p', 'ork')]
TMM18990901-V11-09-page3.txt: [('MISSION', 'ARY')]
TMM18991001-V11-10-page37.txt: [('a', 're')]
TMM18991001-V11-10-page46.txt: [('an', 'swers')]
TMM18991201-V11-12-page29.txt: [('ha', 've')]
TMM18991201-V11-12-page40.txt: [('Nebuchadn', "ezzar's")]
TMM19000101-V12-01-page43.txt: [('a', 'nd')]
TMM19000101-V12-01-page47.txt: [('MISSION', 'ARY')]
TMM19000101-V12-01-page52.txt: [('DEVELOP', 'MENT')]
TMM19000201-V12-02-page29.txt: [('a', 'nd')]
TMM19000201-V12-02-page36.txt: [('corre', 'sponding')]
TMM19000201-V12-02-page49.txt: [('Miss', 'IONARY')]
TMM19000201-V12-02-page51.txt: [('Mission', 'arY')]
TMM19000301-V12-03-page42.txt: [('my', 'thology')]
TMM19000301-V12-03-page45.txt: [('hard', 'ly')]
TMM19000301-V12-03-page8.txt: [('the', 'Ta')]
TMM19000401-V12-04-page25.txt: [('car', 'ried')]
TMM19000501-V12-05-page32.txt: [('MISSION', 'ARY')]
TMM19000501-V12-05-page33.txt: [('wonder', 'ful')]
TMM19000501-V12-05-page5.txt: [('second', 'ary')]
TMM19000601-V12-06-page10.txt: [('a', 'li')]
TMM19000601-V12-06-page11.txt: [('be', 'ng')]
TMM19000601-V12-06-page52.txt: [('the', 'Remin')]
TMM19000801-V12-08-page34.txt: [('to', 'Shiba')]
TMM19000801-V12-08-page5.txt: [('cent', 'uries')]
TMM19000901-V12-09-page51.txt: [('con', 'nection')]
TMM19001001-V12-10-page31.txt: [('a', 'nd')]
TMM19001001-V12-10-page44.txt: [('re', 'ct'), ('a', 're')]
TMM19001001-V12-10-page52.txt: [('A', 'IL')]
TMM19001001-V12-10-page6.txt: [('Am', 'alekites')]
TMM19001001-V12-10-page8.txt: [('MISSION', 'ARY')]
TMM19001101-V12-11-page51.txt: [('P', 'hiladelphia')]
TMM19001201-V12-12-page2.txt: [('co', 'mpany')]
TMM19001201-V12-12-page23.txt: [('k', 'eeping')]
TMM19020201-V14-02-page21.txt: [('MISSION', 'ARY')]
TMM19020201-V14-02-page31.txt: [('Aguas', 'Calientes')]
TMM19020201-V14-02-page48.txt: [('Bell', 'oc')]
TMM19020201-V14-02-page52.txt: [('CEN', 'TRAL'), ('R', 'IP')]
TMM19020301-V14-03-page2.txt: [('B', 'RA')]
TMM19020301-V14-03-page3.txt: [('CEN', 'TRAL')]
TMM19020301-V14-03-page34.txt: [('con', 'verts')]
TMM19020401-V14-04-page3.txt: [('CEN', 'TRAL')]
TMM19020501-V14-05-page17.txt: [('the', 'mis')]
TMM19020501-V14-05-page2.txt: [('a', 'GA')]
TMM19020501-V14-05-page3.txt: [('CEN', 'TRAL'), ('E', 'xcursion')]
TMM19020501-V14-05-page8.txt: [('pro', 'tection')]
In [48]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/TMM/correction6 Average verified rate: 0.9848273395829028 Average of error rates: 0.02219012987012987 Total token count: 861286
In [49]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[49]:
[('e', 484),
('w', 475),
("'", 466),
('m', 342),
('t', 342),
('r', 310),
('d', 302),
('n', 300),
('f', 271),
('g', 254),
('th', 107),
('x', 76),
('co', 68),
('k', 67),
('u', 64),
('pa', 63),
('z', 63),
('io', 42),
('oc', 40),
('oo', 33),
('cc', 31),
('al', 21),
('q', 21),
('mt', 20),
('hausaland', 19),
('id', 19),
('ft', 19),
('stauffer', 19),
('zo', 18),
('basle', 18),
('mo', 18),
('couva', 17),
('kalaka', 17),
('hasegawa', 17),
('sul', 17),
('re', 17),
('okohira', 16),
('helsingfors', 15),
('pp', 15),
('sabbathschool', 15),
("hours'", 15),
('schwantes', 15),
('raiatea', 15),
('wm', 15),
('ro', 15),
('ioo', 14),
('seventhday', 14),
('ic', 14),
("''", 13),
('te', 13)]
Review Remaining Errors¶
In [50]:
reports.docs_with_high_error_rate(summary)
Out[50]:
[('TMM18980701-V10-07-page42.txt', 1.0),
('TMM18990201-V11-02-page10.txt', 0.9),
('TMM19000301-V12-03-page9.txt', 0.614),
('TMM18980101-V10-01-page4.txt', 0.605),
('TMM18991101-V11-11-page23.txt', 0.534),
('TMM18980301-V10-03-page25.txt', 0.517),
('TMM18990801-V11-08-page11.txt', 0.512),
('TMM18980401-V10-04-page4.txt', 0.5),
('TMM18990301-V11-03-page17.txt', 0.5),
('TMM18990701-V11-07-page10.txt', 0.5),
('TMM18990401-V11-04-page4.txt', 0.5),
('TMM18980301-V10-03-page6.txt', 0.449),
('TMM18980601-V10-06-page20.txt', 0.448),
('TMM19020301-V14-03-page2.txt', 0.363),
('TMM18980501-V10-05-page28.txt', 0.341),
('TMM18980301-V10-03-page18.txt', 0.333),
('TMM18980301-V10-03-page10.txt', 0.333),
('TMM18990501-V11-05-page48.txt', 0.333),
('TMM18990801-V11-08-page48.txt', 0.321),
('TMM19001001-V12-10-page1.txt', 0.317),
('TMM19000601-V12-06-page48.txt', 0.302),
('TMM19000101-V12-01-page10.txt', 0.291),
('TMM18990101-V11-01-page48.txt', 0.263),
('TMM18981201-V10-12-page20.txt', 0.25),
('TMM19000501-V12-05-page4.txt', 0.25),
('TMM18990601-V11-06-page48.txt', 0.25),
('TMM18980701-V10-07-page38.txt', 0.235),
('TMM18980201-V10-02-page13.txt', 0.219),
('TMM19020501-V14-05-page51.txt', 0.213)]
In [52]:
# %load shared_elements/high_error_rates.py
doc_keys = [x[0] for x in reports.docs_with_high_error_rate(summary) if x[1] > 0.3]
utilities.open_original_docs(doc_keys, directories['cycle'])
Opened files: TMM18980701-V10-07-page42.txt TMM18990201-V11-02-page10.txt TMM19000301-V12-03-page9.txt TMM18980101-V10-01-page4.txt TMM18991101-V11-11-page23.txt TMM18980301-V10-03-page25.txt TMM18990801-V11-08-page11.txt TMM18980401-V10-04-page4.txt TMM18990301-V11-03-page17.txt TMM18990701-V11-07-page10.txt TMM18990401-V11-04-page4.txt TMM18980301-V10-03-page6.txt TMM18980601-V10-06-page20.txt TMM19020301-V14-03-page2.txt TMM18980501-V10-05-page28.txt TMM18980301-V10-03-page18.txt TMM18980301-V10-03-page10.txt TMM18990501-V11-05-page48.txt TMM18990801-V11-08-page48.txt TMM19001001-V12-10-page1.txt TMM19000601-V12-06-page48.txt
Most of the high error documents match the usual pattern of maps, images, and charts. One interesting exception is "TMM18980301-V10-03-page25.txt", which is in Spanish. I examined the original OCR and there were no accent marks that were lost during normalizing.
In [55]:
reports.long_errors(errors_summary, min_length=15)
Out[55]:
(['austria-hungaria', 'scripture-sabbath', 'gospel-commission', 'elevatedrailroad', 'newly-established', 'darjeeling-above', 'spanish-speaking', 'soul-and-body-destroying', 'into-insignificance', 'heaven-descended', 'hastily-organized', 'greatgrandparents', "'globetrottings'", 'hard-heartedness', 'stivkimikarkaaagiaiwatkaaiiiikiiiikit', 'scene-guadalajara', 'self-commendation', 'apoitleshipbeitring', 'interestinebible', 'fourteenyear-old', 'mamouret-ul-aziz', 'nezdterrerwiethe', 'cigarette-papers', 'pylrlitigeltrlile', 'self-denyingfollowers', 'joinherinherlabors', 'artificially-made', "controversy'''among", 'out-stations--one', 'long-experienced', 'intelligent-looking', 'milkailkiiticimallikillitcattikilit', 'spanish-american', 'otherispanish-speaking', 'charity-begins-at-home', 'self-complacency', "looks'upon-their", 'inexactconformity', 'self-aggrandizement', 'daughters-in-law', 'fourthsabbathexercise', 'sonderburg-glucksburg', 'christianfarmers', 'waterloo-jamaica', 'thickly-timbered', 'fire-worshippers', 'frontispiece-mamma', 'blood-corpuscles', 'literally-fulfiled', 'erichermerchantsintheusualway', 'kaailikarkalikaii', 'nrinfimparirlittlawawlit', 'appropropriately', 'innocent-looking', 'vapolreidzeerrewniteh', 'inspector-general', 'chinese-japanese', 'accountabilities', 'tamtatikivnityleysa', 'pricedinitrzements', 'pleasure-seekers', "sabbath-keeper's", 'lifrimmiiimenspirmiivinfillir', 'governorgenerals', 'ersacizaznovovar', 'ereationaniagara', 'trial--freeesendme', 'printing-presses', 'sasiiiiisnamiximinbegpeemnize', 'avcitosivivocktickpeptv', 'calvary-redeemed', 'rifirmtairiiiitliww', 'self-forgetfulness', 'civilizationexists', 'nieswaynorkadvocateofworld', 'more-than-one-halffinger-long', 'ronoliilgichinese', 'buildingresembling', 'ofwhose-pronunciation', 'tlimmoutrlosillummultrm', 'christ-followers', 'orikakipkokyikartikawaavaikaaiiikaiiikaitio', 'lengthandquality', 'fircaraitisttiattkiisikaikaikatwatekattit', 'sp-anish-speaking', 'doppelschraubenpostdampfer', 'panama-hat-is-aproduetion-of', 'christianity-its', 'narrow-mindedness', 'ilhaillimillkillitilliiraliirailhallikakiiilkilitilikiiilki', 'intienregstdxperience', 'semi-independent', 'sugar-plantation', 'self-propagation', 'buluwayo-zambesia', 'swedish-speaking', 'simple-mindedness', 'anti-progressive', 'rotilezdtftervtee', 'christianizingthe', 'frontispiece-harbor', 'attentiontshould', 'infaluableutenaill', 'broad-shouldered', 'pcmammariscloist', 'italian-speaking', 'health-restoring', 'long-established', 'sixteenprovinces', 'learningsomething', 'mievivimmeirinfirfa', 'recently-developed', 'non-commissioned', 'church-fellowship', 'english-japanese', "stauffer'szletter", 'germanicthoroughlyy', 'doppelsehraubenpostdampfer', 'church-membership', 'anidrgceontfaolutnsd', 'japanese-english', 'commandment-keeping', 'well-proportioned', 'nativity-interior', 'french-switzerland', 'powakikrilifwvfairarlit', 'american-spanish', 'ttttttttttttttttts', 'frontispiece-thatched', 'tfirinfargiiralt', 'subscriptionsshould', 'poorly-furnished', 'self-established', 'alexandria-troas', 'irillillkillrallikillbilibilirrillralillkillullitilllikillp', 'coffee-producing', 'artistically-built', 'boarding-schools', 'fanning-machines', 'amphitheatershaped', 'wthetreyovuaneotrsierpyiyot', 'kindergarten-school', 'fellow-passenger', 'gorgeously-arrayed', 'generalconference', 'hopelessicondition', 'sabbath-breaking', 'ratilikalattidir', 'accommodation-houses', 'fitthstliamascit', 'earthquake-visited', 'ositviivtablebbk', 'interspersedhere', 'brilliantly-lighted', 'fellow-countrymen', 'poverty-stricken', 'ethical-political', 'light-complexioned', 'commander-in-chief', 'chimney-blackened', 'thrashing-machine', 'tkilikarstatatamtitaiwookiiiikakotarkit', 'governor-general', 'anti-footbinding', 'alexandria-troas--had', 'firfinifwillirlrilliirilirlitilitalrerwlik'], 15)
In [ ]: