TMM-OCR-Evaluation-and-Correction
In [1]:
%load_ext autoreload
In [2]:
%autoreload 2
In [3]:
from text2topics import reports
from text2topics import utilities
from text2topics import clean
import re
import os
from os import listdir
from os.path import isfile, join
import collections
In [4]:
%matplotlib inline
In [5]:
wordlist_dir = "/Users/jeriwieringa/Dissertation/drafts/data/word-lists"
wordlists = ["2016-12-07-SDA-last-names.txt",
"2016-12-07-SDA-place-names.txt",
"2016-12-08-SDA-Vocabulary.txt",
"2017-01-03-place-names.txt",
"2017-02-14-Base-Word-List-SCOWL&KJV.txt",
"2017-02-14-Roman-Numerals.txt",
"2017-03-01-Additional-Approved-Words.txt"
]
In [6]:
spelling_dictionary = utilities.create_spelling_dictionary(wordlist_dir, wordlists)
In [7]:
title = "TMM"
In [8]:
base_dir = "/Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/{}/".format(title)
Baseline¶
In [9]:
cycle = 'baseline'
In [10]:
stats = reports.overview_report(join(base_dir, cycle), spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/TMM/baseline Average verified rate: 0.9620184818421186 Average of error rates: 0.048324675324675326 Total token count: 870476
In [11]:
errors_summary = reports.get_errors_summary( stats )
reports.top_errors( errors_summary, 30 )
Out[11]:
[('-', 1111), ("'", 540), ('¥', 474), ('w', 467), ('e', 451), ('ñ', 412), (')', 345), ('m', 331), ('n', 290), ('r', 290), ('t', 285), ('d', 273), ('con-', 259), ('f', 239), ('g', 237), ('re-', 221), ('tion', 198), ('mis-', 160), ('in-', 147), ('*', 145), ('(', 128), ('com-', 117), ('ñthe', 107), ('th', 106), ('be-', 105), ('¡', 91), ('de-', 87), ('sionary', 87), ('mission-', 82), ('ment', 78), ('ex-', 76), ('ary', 70), ('tions', 69), ('co', 68), ('x', 67), ('pa', 63), ('k', 62), ('en-', 61), ('u', 61), ('[illustrated]', 60), ('+', 60), ('per-', 59), ('pro-', 58), ('/', 58), ('z', 56), ('dis-', 53), ('ple', 51), ('peo-', 49), ('(the', 49), ('pre-', 48), ('ers', 47), ('an-', 46), ('un-', 46), ('ad-', 43), ('ence', 42), ('ñwe', 41), ('(illustrated)', 41), ('io', 41), ('oc', 40), ('_', 40), ('ber', 39), ('inter-', 39), ('ã', 39), ('ac-', 38), ('for-', 38), ('to-', 37), ('meet-', 37), ('im-', 36), ('thou-', 35), ('can-', 34), (']', 33), ('ña', 32), ('oo', 32), ('mes-', 31), ('al-', 31)]
Check Special Character Use¶
In [12]:
reports.tokens_with_special_characters(errors_summary)
Out[12]:
[('¥', 474), ('ñ', 412), (')', 345), ('*', 145), ('(', 128), ('ñthe', 107), ('¡', 91), ('+', 60), ('[illustrated]', 60), ('/', 58), ('(the', 49), ('ñwe', 41), ('(illustrated)', 41), ('_', 40), ('ã', 39), (']', 33), ('ña', 32), ('ñall', 29), ('`', 27), ('(b)', 26), ('ñin', 26), ('(c)', 26), ('(a)', 26), ('(d)', 23), ('=', 23), ('ñit', 22), ('ñsubscriptions', 22), ('%', 22), ('ñelder', 21), ('departmentñ', 21), ('(a', 21), ('(march)', 20), ('ô', 19), ('¥¥', 19), ('(to', 19), ('ñan', 18), ('(see', 17), ('(for', 17), ('(april)', 16), ('(which', 16), ('ñand', 16), ('(e)', 16), ('(may)', 15), ('ñone', 15), ('\\', 15), ('(poem)', 14), ('£', 14), ('(and', 14), ('(including', 13), ('(as', 13), ('(in', 13), ('[see', 13), ('ñno', 12), ('(f)', 12), ('¥¥¥', 12), ('ñto', 12), ('holland)', 12), ('ñid', 12), ('(incorporated', 12), ('(concluded', 11), ('ñtest', 11), ('>', 11), ('(or', 11), ('[poem]', 10), ('(continued', 10), ('+contents+', 10), ('ñmrs', 10), ('ñjune', 9), ('(acts', 8), ('(j', 8), ('ñjuly', 8), ('ñbrother', 8), ('ñi', 8), ('(january)', 8), ('¥¥¥¥¥¥¥¥', 7), ('ñhe', 7), ('(fourth', 7), ('[second', 7), ('ñfrom', 7), ('(june)', 7), ('christianityñits', 7), ('ñour', 7), ('ñmay', 7), ('<', 7), ('-¥', 7), ('ñspecial', 7), ('ñfebruary', 6), ('ñjohn', 6), ('(august)', 6), ('(july)', 6), ('ñwhen', 6), ('reading)', 6), ('[fourth', 6), ('ñfacts', 6), ('ñjust', 6), ('ñapril', 6), ('(i', 6), ('reading]', 6), ('(december', 6), ('ñdr', 6), ('the¥', 6), ('ñmarch', 5), ("'¥", 5), ('(if', 5), ('ñnot', 5), ('û', 5), ('(dan', 5), ('ñsince', 5), ('\ufeff', 5), ('%x', 5), ('ñseptember', 5), ('ñhome', 5), ('them)', 5), ('[', 5), ('world)', 5), ('++', 5), ('readingñsabbath', 5), ('it)', 5), ('(there', 5), ('(september)', 5), ('land)', 5), ('(rom', 5), ('~~', 5), ('(i)', 5), ('(from', 5), ('ñthat', 5), ('(g)', 4), ('(april', 4), ('cenñ', 4), ('[the', 4), ('{', 4), ('f\x90te-day', 4), ('ñrev', 4), ('ñbecause', 4), ('`i', 4), ('(rev', 4), ('ñmr', 4), ('(march', 4), ('(work', 4), ('(with', 4), ('school)', 4), ('ñdecember', 4), ('service]', 4), ('people)', 4), ('(of', 4), ('(verse', 4), ('ñon', 4), ('(oregon)', 4), ('darjeelingñabove', 4), ('¥¥¥¥', 4), ('ñheb', 4), ("¥'", 4), ('ñprofessor', 4), ('river)', 4), ('ñabout', 4), ('ñmatt', 4), ('ñduring', 4), ('ñjanuary', 4), ('(isa', 4), ('(february', 4), ('weeks)', 3), ('¥¥¥¥¥', 3), ('(building', 3), ('natives)', 3), ('_-', 3), ('mission)', 3), ('workñit', 3), ('(nearly', 3), ('lapps)', 3), ('year)', 3), ('¥v', 3), ('^', 3), ('ñevery', 3), ('a¥nd', 3), ('o)', 3), ('(that', 3), ('workñ', 3), ('ñmissionary', 3), ('(r', 3), ('water)', 3), ('))', 3), ('called)', 3), ('*the', 3), ('organ)', 3), ('-(', 3), ('}', 3), ('cooked)', 3), ('(it', 3), ('i)', 3), ('ñsel', 3), ('oolooberiaña', 3), ('[*', 3), ('ñas', 3), ('themñand', 3), ('ñoctober', 3), ('ñlast', 3), ('(called', 3), ('fund)', 3), ('¥-¥', 3), ('¥i', 3), ('may)', 3), ('(not', 3), ('ñisa', 3), ('(one', 3), ('ñs', 3), ('ñat', 3), ('ñthis', 3), ('(denmark', 3), ('ñpart', 3), ('exercise]', 3), ('(december)', 3), ('watchwordñ', 3), ('ñyes', 3), ('ñthere', 3), ('ñfor', 3), ('(john', 3), ('¥t', 3), ('are)', 3), ('±', 3), ('[illustrated', 3), ('brazilñ', 3), ('(kansas)', 2), ('(ind', 2), ('(most', 2), ('✓', 2), ('(meaning', 2), ('ñtwo', 2), ('r)', 2), ('ñgospel', 2), ('package)', 2), ('//', 2), ('ñspurgeon', 2), ('ñnew', 2), ('(six', 2), ('ago)', 2), ('ñgod', 2), ("+'", 2), ('departme\\t', 2), ('oneñthat', 2), ('ñselected', 2), ('s/', 2), ('_a', 2), ('ñso', 2), ('here)', 2), ('(though', 2), ('ö', 2), ('g%', 2), ('¥¥¥¥¥¥¥¥¥¥', 2), ('(about', 2), ('(ad', 2), ('(mark', 2), ('ruary)', 2), ('humanityñto', 2), ('spirit)', 2), ('¥the', 2), ('stãpittsburg', 2), ("¡'", 2), ('(f', 2), ('daughter)', 2), ('f\x90te', 2), ('missio\\ary', 2), ('(religious', 2), ('hallña', 2), ('only)', 2), ('(all', 2), ('ñyea', 2), ('ñletters', 2), ('/t', 2), ('¥s', 2), ('ñseveral', 2), ('christñthe', 2), ('*ghest', 2), ('(dutch', 2), ('troubleñwhether', 2), (')v', 2), ('`<', 2), ('ñother', 2), ('day)', 2), ('ñbut', 2), ('¥%', 2), ('times)', 2), ('(t', 2), ('ñr', 2), ('children)', 2), ('raceñthe', 2), ('(february)', 2), ('ioo¡', 2), ('cñtwin', 2), ('(christ)', 2), ('worldñis', 2), ('(trinidad)', 2), ('ñword', 2), ('week)', 2), ('(in-', 2), ('ñthey', 2), ('ñdesire', 2), ('ç', 2), ('colony)', 2), ('plata)', 2), ('(margin)', 2), ('time)', 2), ('gospelñthe', 2), ('the_', 2), ('(now', 2), ('church)', 2), ('¥new', 2), ('old)', 2), ('ñsome', 2), ('#', 2), ('ñwith', 2), ('english)', 2), ('ñchristian', 2), ('(but', 2), ('(southern)', 2), ('(denmark)', 2), ('t*', 2), ('(on', 2), ('states)', 2), ('ñof', 2), ('town)', 2), ('caf\x8es', 2), ('¥e', 2), ('churchñthe', 2), ('cut)', 2), ('worldñto', 2), ('nomñthe', 2), ('ig*', 2), ('(they', 2), ('ñif', 2), ('(miss', 2), ('ñpsalm', 2), ('april)', 2), ('(feb-', 2), ('exampleñthe', 2), ('ñtee', 2), ('ñerratum', 2), ('verseñ', 2), ('magazine)', 2), ('(january', 2), ('(holy', 2), ('¥well', 2), ('one¥', 2), ('¥he', 2), ('specialñthe', 2), ('fool)', 2), ('peopleñi', 2), ('(concluded)', 2), ('ñphillips', 2), ('feet)', 2), ('ñby', 2), ('(signs', 2), ('ý', 2), ('ñstudent', 2), ('z¥', 2), ('c¥', 2), ('ñis', 2), ('a¡', 2), ('[in', 2), ('ñspiritual', 2), ('parts)', 2), ('citiesñand', 2), ('(verses', 2), ('allñ', 2), ('beñthe', 2), ('-`', 2), ('e¥', 2), ('a¥re', 2), ('(coolies)', 2), ('jews)', 2), ('(an', 2), ('(sometimes', 2), ('ñhis', 2), ('ñluke', 2), ('i¥', 2), ('bay)', 2), ('ñdo', 2), ('cheetstãphiladelphia', 2), ('(generally', 2), ('ñout', 2), ('life)', 2), ('themñthe', 2), ('(thoroughly', 2), (')+', 2), ('erectedñone', 2), ('/-', 2), ('do)', 2), ('our¥', 2), ('(light', 2), ('(revelation', 2), ('godñnot', 2), ('(may', 2), ('[a', 2), ('(gal', 2), ('sayñand', 2), ('ñreview', 2), ('(later', 2), ('(michigan)', 2), ('**', 2), ('days)', 2), ('manña', 2), ('map)', 2), ('man)', 2), ('mallettñdear', 2), ('***', 2), ('``', 2), ('ñmy', 2), ('(god)', 2), ('ary)', 2), ('(alabama)', 2), ('[to', 2), ('(little', 2), ('(fig', 2), ('victoriañbut', 1), ('healthfully/', 1), ('thisñdishonesty', 1), ('(entre', 1), ('first)', 1), ('ñthree', 1), ('importa]', 1), ('partnerñnow', 1), ('_t_h/so', 1), ('ôof', 1), ('ñsucce', 1), ('churchñit', 1), ('menñthe', 1), ('body)', 1), ('(church)', 1), ('englandñto', 1), ('solitudeñthe', 1), ('(yang-tse', 1), ('f¥', 1), ('restñfor', 1), ('presentñperhaps', 1), ('preparationsñnot', 1), ('thingñfor', 1), ('ñaugust', 1), ('examination)', 1), ('possessionsñall', 1), ('*t', 1), ('societyñ', 1), ('yetñis', 1), ('dollarsñnine', 1), ("r'r%", 1), ('[orang', 1), ('kittsñthey', 1), ('text=bookñnovember', 1), ('(leap', 1), ('_enjoys', 1), ('/l', 1), ('millionsñone-third', 1), ('gatherings)', 1), ('cornñmealiesñis', 1), ('v/', 1), ('enciesñgrand', 1), ('power¥', 1), ('weaknessñ', 1), ('(thena', 1), ('winterñall', 1), ('ñlet', 1), ('myselfñduring', 1), ('on(', 1), ('hregardingv/', 1), ('floorsñthough', 1), ('first¥', 1), ('vationñhis', 1), ('encouraged)', 1), ('monstersñthe', 1), ('tonñunexcelled', 1), ('(wakenaam)', 1), ("tea'ã'is", 1), ('usñsend', 1), ('ôtis', 1), ('¥a', 1), ('worldñextends', 1), ('ñmost', 1), ('gu`', 1), ('vaticanñthe', 1), ('(cow', 1), ('*since', 1), ('argentina_', 1), ('ñg', 1), ('lotñit', 1), ('(j)', 1), ('countryñi', 1), ('slavesñcaptives', 1), ('stampsñamounting', 1), ('a#', 1), ('images)', 1), ('roomsñone', 1), ('messageñcaptain', 1), ('(sabbath-', 1), ('bondsñthese', 1), ('lakes)', 1), ('proml_tly', 1), ('saleñor', 1), ('continued)', 1), ('sister¥', 1), ('(helsingfors)', 1), ('(local', 1), ('about¥', 1), ('miiiiim=', 1), ('slightñfrom', 1), ('christiansñwe', 1), ('islesñst', 1), ('king¥', 1), ('w(', 1), ('(lao-tsze', 1), ('amo\\g', 1), ('(o', 1), ('toolñso', 1), ('fraternityñwhen', 1), ('messageña', 1), ('electro=hydropathic', 1), ('womenñcome', 1), ('(jamaica', 1), ('feverñ', 1), ('floor)', 1), ('demandñability', 1), ('knowledgeñthe', 1), ('studyñthe', 1), ('dayñso', 1), ('that)', 1), ('benedictionñelder', 1), ('ministersñall', 1), ('philippinesñbishop', 1), ('rageñall', 1), ('ãit', 1), ('familyñhe', 1), ('ñent', 1), ('continentñafrica', 1), ('this¥', 1), ('libertyñpolit-', 1), ('\\j', 1), ('doctorñassistant', 1), ('(adventist', 1), ('tal)', 1), ('sionaries)', 1), ('`voorlooper', 1), ('sideñand', 1), ('ñseven', 1), ('ñdifferent', 1), ('j¥', 1), ('actsñin', 1), ('ñmaybe', 1), ("'(", 1), ('c)', 1), ('ñsowing', 1), ('`there', 1), ('ground)', 1), ('letterñtwo', 1), ('biscuit)', 1), ('committeeñthat', 1), ('`kc', 1), ('classñthose', 1), ('nursesñwe', 1), ('ñpaul', 1), ('macheteñthe', 1), ('t`', 1), ('ñjesus', 1), ('a-*/**/¥', 1), ("^'cottiteri", 1), ('additionto_abont', 1), ('soulñ', 1), ('viewñabsolutely', 1), ('on/daniel', 1), ('/ft', 1), ('cattleñsecond', 1), ('(only', 1), ('fatallyñthe', 1), ('directionñgo', 1), ('arthur)', 1), ('instructionsñ', 1), ('dampña', 1), ('motherhoodñcannot', 1), ('biographyñthe', 1), ('¥+r', 1), (')l', 1), ('goingñ', 1), ('itñthe', 1), ('insectsñthey', 1), ('orderñthe', 1), ('millionñrussia', 1), ('kilaueañprobably', 1), ('countriesñan', 1), ('(speaking', 1), ('*henever', 1), ('countriesñguaranteed', 1), ('superiorñ', 1), ('=mummy', 1), ('cityñand', 1), ('ã-_', 1), ('¥what', 1), ('message)', 1), ('a^or', 1), ('breadfruitña', 1), ('encouragingñbut', 1), ('fieldsñit', 1), ('winterñalways', 1), ('crosses)', 1), ('mules)', 1), ('ñcanon', 1), ('againñ', 1), ('doneñwhen', 1), ('(at', 1), ('ancestorsñfor', 1), ('(continental', 1), ('boundñby', 1), ("-%'", 1), ('¥interest', 1), ('%-', 1), ('ñanoust', 1), ('¥professor', 1), ('destructionñbecause', 1), ('paper*', 1), ('ñplainly', 1), ('goodñwe', 1), ('australiañstellenbosch', 1), ('the/', 1), ('believers)', 1), ('fel¥', 1), ('car_', 1), ('slavesñslaves', 1), ('kingña', 1), ('`commanded', 1), ('billowsñall', 1), ('-/', 1), ('himñmay', 1), ('(apartment', 1), ('ct)', 1), ('agoutiña', 1), ('(fields)', 1), ('margin)', 1), ('numeralsñthe', 1), ('tiv(ptilst', 1), ('indiañ', 1), ('alaska)', 1), ('lazmig[', 1), ('inches=', 1), ('timeñthat', 1), ('(phil', 1), ('¥but', 1), ('eatñalthough', 1), ('missionary`', 1), ('hzinû', 1), ('(col', 1), ('__', 1), ('himselfñwithout', 1), ('[food]', 1), ('racesñkafir', 1), ('climate)', 1), ('crocodile)', 1), ('ñlaces', 1), ('mapsñno', 1), ('ourselvesñhere', 1), ('`lo', 1), ('dutyñthat', 1), ('weekñdecember', 1), ('stateñbolivia', 1), ('laile¥city', 1), ('exceptions)', 1), ('nameaa„ss❑', 1), ('colorsñthey', 1), ('(caravansary)', 1), ('marriageñher', 1), ('harborñsaid', 1), ('writeñeven', 1), ('¡s', 1), ('months_', 1), ('i¥¥', 1), ('thought)', 1), ('malesñwere', 1), ('w(`', 1), ('oppositionñwere', 1), ('-an/', 1), ('(mich', 1), ('patonñthat', 1), ('heardñsublimer', 1), ('(when', 1), ('ñhaving', 1), ('augustñin', 1), ('formerñthey', 1), ('brownñ', 1), ('hospitableñwilling', 1), ('soldiersñwhat', 1), ('actlt`', 1), ('\\i', 1), ('(kwi)', 1), ('spainñlonged', 1), ('baptizedñone', 1), ('¥sasnoh', 1), ('doneñ', 1), ('^ids', 1), ('seekñgo', 1), ("_masse'", 1), ('missionaryñat', 1), ('asiañtheir', 1), ('familyñthe', 1), ('carriage¥road', 1), ('(after', 1), ('earn=', 1), ('ôô`', 1), ('(freedom', 1), ('tk%', 1), ('certainñ', 1), ('selfñof', 1), ('(very', 1), ('obi-women)', 1), ('iiiiii=viii', 1), ('(their', 1), ('ñeducation', 1), ('(servants)', 1), ('ñbooker', 1), ('text-bookñ', 1), ('ñsmith', 1), ('wellñtime', 1), ('agesñi', 1), ('hulañperformed', 1), ('``yellow', 1), ('<¥', 1), ('countriesñfrance', 1), ('`and', 1), ('cruzñaside', 1), ('ropeñthe', 1), ('especiallyñand', 1), ('groundñperhaps', 1), ("ã'it", 1), ('spanish)', 1), ('*igit', 1), ('neighborsñone', 1), ('ic)', 1), ('theeñpray', 1), ('saved)', 1), ('¡god', 1), ('peaksñpopocatepetl', 1), ('ill_', 1), ('ñtestimonies', 1), ('charcoalñand', 1), ("(')", 1), ('ñgermany', 1), ('policyñhe', 1), ('rabbitñsupposing', 1), ('philip>', 1), ("ñkerr's", 1), ('weekñ', 1), ('possessionñwhere', 1), ('ñten', 1), ('texasñthe', 1), ('¥every', 1), ('desireñright', 1), ('loveñi', 1), ('%¥', 1), ('othersña', 1), ('(naini', 1), ('lifeñunto', 1), ('badñas', 1), ('republic)', 1), ('¥of', 1), ('macheteña', 1), ('(moravian)', 1), ('handsñmore', 1), ('papersñcopies', 1), ('ñdoes', 1), ('usñso', 1), ('fieldsñi', 1), ('stationñthe', 1), ('(mule-drivers)', 1), ('ñjanuanv', 1), ('styleñby', 1), ('christñit', 1), ('m)', 1), ('*presenting', 1), ('planñ', 1), ('babylon)', 1), ('ñmarca', 1), ('tokenñof', 1), ('loveñdie', 1), ('falls)', 1), ('antilles)', 1), ('loebsack)', 1), ('trvtk`t', 1), ('ñjoshua', 1), ('comeñ', 1), ('islandsñ', 1), ('stãboston', 1), ('levuñ', 1), ('¥chicago', 1), ('argentineñwill', 1), ('faithñnot', 1), ('ãtoward', 1), ('farmsñonly', 1), ('california¥', 1), ('nationsñall', 1), ('accommodated)', 1), ('guageñ', 1), ('fast-daysñdays', 1), ('someñthey', 1), ('bationñthe', 1), ('has_not', 1), ('chinañeven', 1), ('(d', 1), ('understandñfor', 1), ('complainingñonly', 1), ('-_-', 1), ('mail)', 1), ('creatureñ', 1), ("'lgl`", 1), ('mother-in-lawñthere', 1), ('¥be', 1), ('part)', 1), ('(zech', 1), ('so/apper', 1), ('«iay', 1), ('quartñmuch', 1), ('(ex', 1), ('ñtan', 1), ('enoughñto', 1), ('a)', 1), ('strangersñbut', 1), ('c+p', 1), ("ta'*", 1), ('destinationñcaravellasñinquire', 1), ('ñwomen', 1), ('worshipñsun', 1), ('iã', 1), ('liveñthey', 1), ('log)', 1), ('mexico)', 1), ('necessaries_', 1), ('olanchoñsavannas', 1), ('matabelelandñ', 1), ('soft`', 1), ('hong()', 1), ('tub*', 1), ('ñmark', 1), ('prisonña', 1), ('companyñ', 1), ('\\ad-', 1), ('chineseñhigh', 1), ('historyñso', 1), ('livingstoneñthe', 1), ('f`', 1), ('kv*mk', 1), ('=-¥', 1), ('classñthe', 1), ('*****', 1), ('coveringñonly', 1), ('=lead', 1), ('strugglesñthat', 1), ('countryñreceived', 1), ('enunciatedñtruths', 1), ('(mr', 1), ('descriptionñthe', 1), ('menñmen', 1), ('aspectñ', 1), ('destinationña', 1), ('rã', 1), ('torch)', 1), ('frontikl**-i', 1), ('thingsñand', 1), ('themñi', 1), ('t)', 1), ('four¥', 1), ('exerciseñnovember', 1), ('placeñtreating', 1), ('usñthat', 1), ('understand)', 1), ('tune)', 1), ('chinese¥', 1), ('salvadorñthe', 1), ('—segari', 1), ('peopleñthere', 1), ('wayñ', 1), ('groundñthat', 1), ('¤elf-governing', 1), ('doorsñthe', 1), ('monthly)', 1), ('_thee', 1), ('beñ', 1), ('(three', 1), ('beliefsñsome', 1), ('furnitureñ', 1), ('gardenñplaces', 1), ('(patience)', 1), ('yards)', 1), ('expression)', 1), ('godñeven', 1), ('denseñ', 1), ('whichñkusaie', 1), ('kindsñand', 1), ('yearly)', 1), ('truthñ', 1), ('coloradoñ', 1), ('ñoun', 1), ('mals)', 1), ('sir)', 1), ('aljna/-', 1), ('seañas', 1), ('gloomñthe', 1), ('itself)', 1), ('eaten)', 1), ('degradingñit', 1), ('*elder', 1), ('g¥', 1), ('ó', 1), ('christian)', 1), ('personal)', 1), ('medicineñand', 1), ('yearñever', 1), ('[prague]', 1), ('laborersñmr', 1), ("\\'i", 1), ('¥¥¥-¥', 1), ('worldñthe', 1), ('leftñwe', 1), ('illiterateñthe', 1), ('(satisfied)', 1), ('ir_', 1), ('v¥a', 1), ('switzerlandñhong', 1), ('jerusalemñthey', 1), ('neckñtonsilitis', 1), ('ñbefore', 1), ('uresñchanges', 1), ('ñeducational', 1), ('(n)', 1), ('(rum)', 1), ('to-o)', 1), ('creoles)', 1), ('blackñabout', 1), ('iû', 1), ('ñam', 1), ('fieldsñwhether', 1), ('lifô', 1), ('(his', 1), ('glad_', 1), ('(rome)', 1), ('floorñand', 1), ('¥in', 1), ('_that', 1), ('(unless', 1), ('aitutakians)', 1), ('pestñthe', 1), ('abroadñthe', 1), ('widowhoodñall', 1), ('switzerlandñbulu-', 1), ('(grave-', 1), ('restore¥', 1), ("(+'", 1), ('a_llc', 1), ('sliogunateñso', 1), ('laborer)', 1), ('wordñto', 1), ('(gospel', 1), ('viveritt/', 1), ('(almost', 1), ('-*', 1), ('(beneath', 1), ('ginñmore', 1), ('wasñjesus', 1), ("`surveying'", 1), ('(hot', 1), ('grythyttehedñnoted', 1), ('taotaisñ', 1), ('sectsñthe', 1), ('journeyñand', 1), ('exerciseñoctober', 1), ('a¥', 1), ('amphitheaterñto', 1), ('milesñmore', 1), ('(iowa)', 1), ('personsñnatives', 1), ('(joppa', 1), ('gospelñat', 1), ('(embraces)', 1), ('settlementñwhich', 1), ('voyageñ', 1), ('pôco', 1), ('operations¥', 1), ('ñreading', 1), ('/tis', 1), ('ñex-', 1), ('sceneryñand', 1), ('worldñif', 1), ('gehenna)', 1), ('natalñgeneva', 1), ('t#tnr=ligt', 1), ('steamerñthree', 1), ('groundñeven', 1), ('(some', 1), ('aristocraticñthe', 1), ('ñcyrus', 1), ('*isi', 1), ('valleyñ', 1), ('ginzañthe', 1), ('[when', 1), ('to-morrowñwhile', 1), ('door)', 1), ('-***', 1), ('truthñlearn', 1), ('journeyñto', 1), ...]
Correction 1 -- Normalize Characters¶
In [14]:
# %load shared_elements/normalize_characters.py
prev = "baseline"
cycle = "correction1"
directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
os.makedirs(directories['cycle'])
corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))
for filename in corpus:
content = utilities.readfile(directories['prev'], filename)
# Substitute for all other dashes
content = re.sub(r"—-—–‑", r"-", content)
# Substitute formatted apostrophe
content = re.sub(r"\’\’\‘\'\‛\´", r"'", content)
# Replace all special characters with a space (as these tend to occur at the end of lines)
content = re.sub(r"[^a-zA-Z0-9\s,.!?$:;\-&\'\"]", r" ", content)
with open(join(directories['cycle'], filename), mode="w") as o:
o.write(content)
o.close()
In [17]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/TMM/correction1 Average verified rate: 0.9689343941867684 Average of error rates: 0.03858441558441559 Total token count: 869740
In [19]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[19]:
[('-', 1170), ("'", 573), ('e', 479), ('w', 475), ('m', 337), ('t', 314), ('r', 307), ('d', 301), ('n', 295), ('f', 268), ('con-', 259), ('g', 250), ('re-', 222), ('tion', 198), ('mis-', 161), ('in-', 149), ('com-', 117), ('th', 109), ('be-', 105), ('de-', 87), ('sionary', 87), ('mission-', 82), ('ment', 78), ('ex-', 77), ('ary', 74), ('x', 72), ('co', 70), ('tions', 69), ('u', 63), ('pa', 63), ('k', 63), ('en-', 61), ('per-', 59), ('pro-', 58), ('z', 58), ('dis-', 53), ('ple', 51), ('peo-', 49), ('pre-', 48), ('ers', 47), ('un-', 46), ('an-', 46), ('ad-', 44), ('ence', 42), ('io', 42), ('oc', 40), ('ber', 40), ('inter-', 39), ('for-', 38), ('ac-', 38)]
Correction 2 -- Correct Line Endings¶
In [21]:
# %load shared_elements/correct_line_endings.py
prev = cycle
cycle = "correction2"
directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
os.makedirs(directories['cycle'])
corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))
for filename in corpus:
content = utilities.readfile(directories['prev'], filename)
content = re.sub(r"(\w+)(\-\s{1,})([a-z]+)", r"\1\3", content)
with open(join(directories['cycle'], filename), mode="w") as o:
o.write(content)
o.close()
In [24]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/TMM/correction2 Average verified rate: 0.9817024929526814 Average of error rates: 0.02631636363636364 Total token count: 862030
In [25]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[25]:
[('-', 1157), ("'", 573), ('e', 479), ('w', 475), ('m', 337), ('t', 312), ('r', 305), ('d', 301), ('n', 295), ('f', 267), ('g', 250), ('th', 109), ('x', 72), ('co', 69), ('pa', 63), ('k', 63), ('u', 63), ('z', 58), ('io', 42), ('oc', 40), ('mis-', 39), ('oo', 33), ('cc', 29), ('sionary', 29), ('--', 28), ('money-order', 24), ("'the", 23), ('q', 21), ('al', 21), ('mt', 20), ('ary', 19), ('id', 19), ('spanish-speaking', 19), ('hausaland', 19), ("''", 19), ('stauffer', 19), ('ft', 18), ('mo', 18), ('zo', 18), ('basle', 18), ('re', 18), ('hasegawa', 17), ('couva', 17), ('kalaka', 17), ('-the', 17), ('sul', 17), ('okohira', 16), ('ro', 16), ('sabbathschool', 15), ('pp', 15)]
Correction 3 -- Remove Extra Dashes¶
In [27]:
# %load shared_elements/remove_extra_dashes.py
prev = cycle
cycle = "correction3"
directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
os.makedirs(directories['cycle'])
corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))
for filename in corpus:
content = utilities.readfile(directories['prev'], filename)
text = re.sub(r"[0-9,!?$:;&]", " ", content)
tokens = utilities.tokenize_text(text)
replacements = []
for token in tokens:
if token[0] is "-":
replacements.append((token, token[1:]))
elif token[-1] is "-":
replacements.append((token, token[:-1]))
else:
pass
if len(replacements) > 0:
print("{}: {}".format(filename, replacements))
for replacement in replacements:
content = clean.replace_pair(replacement, content)
else:
pass
with open(join(directories['cycle'], filename), mode="w") as o:
o.write(content)
o.close()
TMM18980101-V10-01-page1.txt: [('Mis-', 'Mis')] TMM18980101-V10-01-page12.txt: [('-one', 'one'), ('-brought', 'brought'), ('-their', 'their'), ('-worship', 'worship')] TMM18980101-V10-01-page13.txt: [('-appears', 'appears')] TMM18980101-V10-01-page14.txt: [('-was', 'was')] TMM18980101-V10-01-page15.txt: [('-Baptists', 'Baptists')] TMM18980101-V10-01-page23.txt: [('respond-', 'respond')] TMM18980101-V10-01-page26.txt: [('Waterloo-', 'Waterloo'), ('-Jamaica.', 'Jamaica.'), ('-by', 'by')] TMM18980101-V10-01-page28.txt: [('-this', 'this')] TMM18980101-V10-01-page32.txt: [('-WE', 'WE')] TMM18980101-V10-01-page4.txt: [('--a', '-a'), ('-', ''), ('-rse', 'rse'), ('-', '')] TMM18980101-V10-01-page9.txt: [('ene-', 'ene')] TMM18980201-V10-02-page11.txt: [('---', '--')] TMM18980201-V10-02-page13.txt: [('-K', 'K'), ('-N', 'N'), ('AricuN-', 'AricuN'), ('-', ''), ('-', '')] TMM18980201-V10-02-page14.txt: [('Anglo-', 'Anglo'), ('-too', 'too')] TMM18980201-V10-02-page17.txt: [('-miles', 'miles'), ('op-', 'op')] TMM18980201-V10-02-page22.txt: [('prom-', 'prom')] TMM18980201-V10-02-page27.txt: [('-', '')] TMM18980201-V10-02-page32.txt: [('-', '')] TMM18980201-V10-02-page33.txt: [('CON-', 'CON'), ('-', '')] TMM18980201-V10-02-page35.txt: [('liter-', 'liter')] TMM18980201-V10-02-page37.txt: [('MIS-', 'MIS'), ('QUAR-', 'QUAR'), ('.-', '.')] TMM18980201-V10-02-page38.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] TMM18980201-V10-02-page39.txt: [('SECRE-', 'SECRE'), ('Mts-', 'Mts')] TMM18980201-V10-02-page6.txt: [('-', '')] TMM18980201-V10-02-page7.txt: [('Nes-', 'Nes')] TMM18980201-V10-02-page9.txt: [('-', ''), ('-', '')] TMM18980301-V10-03-page12.txt: [('Gar-', 'Gar')] TMM18980301-V10-03-page16.txt: [('-', '')] TMM18980301-V10-03-page19.txt: [('-', '')] TMM18980301-V10-03-page24.txt: [('Mis-', 'Mis')] TMM18980301-V10-03-page25.txt: [('O-----', 'O----'), ('-reveladas', 'reveladas'), ('galar-', 'galar')] TMM18980301-V10-03-page28.txt: [('com-', 'com')] TMM18980301-V10-03-page31.txt: [('-be', 'be'), ('-work', 'work')] TMM18980301-V10-03-page32.txt: [('---g.', '--g.'), ('-P.', 'P.'), ('-krka', 'krka')] TMM18980301-V10-03-page37.txt: [('estab-', 'estab'), ('Jan-', 'Jan')] TMM18980301-V10-03-page38.txt: [('-', '')] TMM18980301-V10-03-page39.txt: [('Mis-', 'Mis'), ('Mis-', 'Mis'), ('-', '')] TMM18980301-V10-03-page5.txt: [('-', '')] TMM18980301-V10-03-page6.txt: [('C--', 'C-'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('..-', '..'), ('-', ''), ('-', '')] TMM18980401-V10-04-page15.txt: [('-', '')] TMM18980401-V10-04-page17.txt: [('-', '')] TMM18980401-V10-04-page26.txt: [('HISTOR-', 'HISTOR')] TMM18980401-V10-04-page3.txt: [('-', '')] TMM18980401-V10-04-page30.txt: [('-', '')] TMM18980401-V10-04-page31.txt: [('-', '')] TMM18980401-V10-04-page33.txt: [('-I', 'I')] TMM18980401-V10-04-page38.txt: [('encourag-', 'encourag')] TMM18980401-V10-04-page4.txt: [('-', ''), ('-c-', 'c-'), ('-', ''), ('s-', 's'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('e-', 'e'), ('-', ''), ('-.t.', '.t.'), ('-', ''), ('-', ''), ('-', ''), ('..-', '..'), ('-', '')] TMM18980401-V10-04-page40.txt: [('--Did', '-Did')] TMM18980401-V10-04-page6.txt: [('WORK-', 'WORK')] TMM18980501-V10-05-page17.txt: [('-', '')] TMM18980501-V10-05-page24.txt: [('-T.', 'T.')] TMM18980501-V10-05-page25.txt: [('-', ''), ('-', '')] TMM18980501-V10-05-page28.txt: [('-.', '.'), ('i-', 'i'), ('-d', 'd'), ('-', ''), ('.-', '.'), ('-', ''), ('-s-azppos', 's-azppos')] TMM18980501-V10-05-page29.txt: [('-', '')] TMM18980501-V10-05-page30.txt: [('-', '')] TMM18980501-V10-05-page31.txt: [('"Teu-', '"Teu')] TMM18980501-V10-05-page35.txt: [('MIS-', 'MIS'), ('QUAR-', 'QUAR'), ('-', '')] TMM18980501-V10-05-page36.txt: [('-', ''), ('.-', '.')] TMM18980501-V10-05-page37.txt: [('has-', 'has')] TMM18980501-V10-05-page38.txt: [('-THREE', 'THREE')] TMM18980501-V10-05-page39.txt: [('re-', 're')] TMM18980601-V10-06-page16.txt: [('-', ''), ('-', '')] TMM18980601-V10-06-page20.txt: [('-------A--', '------A--'), ('-.', '.'), ('..-', '..'), ('--', '-'), ('--', '-'), ('--..', '-..'), ('-...t', '...t'), ('..-', '..'), ('-', ''), ("....-k'-", "....-k'"), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('K-', 'K'), ('-', ''), ('.-', '.'), ('..--', '..-'), ('-.', '.'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-AN', 'AN'), ('-', ''), ('--', '-'), ('ir-', 'ir'), ('-', ''), ('--', '-'), ('-AI', 'AI'), ('--', '-'), ('-', ''), ('-', ''), ('-', ''), ('.--', '.-'), ('---', '--')] TMM18980601-V10-06-page22.txt: [('lb.-', 'lb.')] TMM18980601-V10-06-page23.txt: [('-JUNE', 'JUNE'), ('-', '')] TMM18980601-V10-06-page24.txt: [('-', ''), ('-', '')] TMM18980601-V10-06-page25.txt: [('-', ''), ('-', '')] TMM18980601-V10-06-page26.txt: [('doing-', 'doing')] TMM18980601-V10-06-page36.txt: [('-', ''), ('leav-', 'leav')] TMM18980701-V10-07-page10.txt: [('-', '')] TMM18980701-V10-07-page11.txt: [('-lying', 'lying')] TMM18980701-V10-07-page12.txt: [('--How', '-How')] TMM18980701-V10-07-page30.txt: [('Amsterdam-', 'Amsterdam')] TMM18980701-V10-07-page36.txt: [('inter-', 'inter')] TMM18980701-V10-07-page38.txt: [('-c', 'c'), ('..-', '..'), ('-', ''), ('-', '')] TMM18980701-V10-07-page4.txt: [('num-', 'num'), ('-', '')] TMM18980701-V10-07-page40.txt: [('MAG-', 'MAG')] TMM18980701-V10-07-page42.txt: [('-', '')] TMM18980701-V10-07-page6.txt: [('Young-', 'Young')] TMM18980801-V10-08-page21.txt: [('Euro-', 'Euro')] TMM18980801-V10-08-page24.txt: [('conse-', 'conse')] TMM18980801-V10-08-page26.txt: [('MISSIONARY-', 'MISSIONARY')] TMM18980801-V10-08-page31.txt: [('z-', 'z'), ('bountifully."-', 'bountifully."')] TMM18980801-V10-08-page35.txt: [('MIS-', 'MIS'), ('QUAR-', 'QUAR'), ('.-', '.'), ('.-', '.')] TMM18980801-V10-08-page38.txt: [('MIS-', 'MIS'), ('MIS-', 'MIS')] TMM18980901-V10-09-page13.txt: [('-in', 'in')] TMM18980901-V10-09-page16.txt: [('house-', 'house')] TMM18980901-V10-09-page31.txt: [('-refining', 'refining'), ('-', '')] TMM18980901-V10-09-page35.txt: [('me-', 'me')] TMM18980901-V10-09-page37.txt: [('tear-drops--', 'tear-drops-')] TMM18980901-V10-09-page8.txt: [('-', '')] TMM18980901-V10-09-page9.txt: [('-view', 'view')] TMM18981001-V10-10-page11.txt: [('-and', 'and'), ('-Fiance', 'Fiance'), ('-', '')] TMM18981001-V10-10-page16.txt: [('-would', 'would')] TMM18981001-V10-10-page18.txt: [('-', '')] TMM18981001-V10-10-page19.txt: [('-', '')] TMM18981001-V10-10-page21.txt: [('-rented', 'rented'), ('-', '')] TMM18981001-V10-10-page28.txt: [('-', ''), ('edu-', 'edu')] TMM18981001-V10-10-page30.txt: [('-', '')] TMM18981001-V10-10-page33.txt: [('-', '')] TMM18981001-V10-10-page34.txt: [('-', ''), ('-', '')] TMM18981001-V10-10-page37.txt: [('-', ''), ('-c', 'c')] TMM18981001-V10-10-page38.txt: [('-', '')] TMM18981101-V10-11-page12.txt: [('-', '')] TMM18981101-V10-11-page17.txt: [('MIS-', 'MIS')] TMM18981101-V10-11-page20.txt: [('cor-', 'cor')] TMM18981101-V10-11-page25.txt: [('MAGA-', 'MAGA')] TMM18981101-V10-11-page27.txt: [('fol-', 'fol')] TMM18981101-V10-11-page29.txt: [('-rendering', 'rendering')] TMM18981101-V10-11-page30.txt: [('-', ''), ('liter-', 'liter')] TMM18981101-V10-11-page31.txt: [('-', '')] TMM18981101-V10-11-page33.txt: [('-at', 'at')] TMM18981101-V10-11-page34.txt: [('-', '')] TMM18981101-V10-11-page35.txt: [('-', '')] TMM18981101-V10-11-page36.txt: [('SOCI-', 'SOCI'), ('.-', '.'), ('MIS-', 'MIS'), ('QUAR-', 'QUAR')] TMM18981101-V10-11-page37.txt: [('-', ''), ('MAGA-', 'MAGA')] TMM18981101-V10-11-page38.txt: [('MAGA-', 'MAGA')] TMM18981101-V10-11-page6.txt: [('great-', 'great')] TMM18981101-V10-11-page7.txt: [('-', '')] TMM18981201-V10-12-page13.txt: [('igno-', 'igno')] TMM18981201-V10-12-page17.txt: [('interme-', 'interme')] TMM18981201-V10-12-page19.txt: [('-little', 'little')] TMM18981201-V10-12-page2.txt: [('-mighty', 'mighty'), ('op-', 'op')] TMM18981201-V10-12-page23.txt: [('-her.', 'her.')] TMM18981201-V10-12-page27.txt: [('-', '')] TMM18981201-V10-12-page32.txt: [('-', '')] TMM18981201-V10-12-page36.txt: [('Sab-', 'Sab')] TMM18981201-V10-12-page4.txt: [('-', '')] TMM18981201-V10-12-page41.txt: [('-', '')] TMM18981201-V10-12-page43.txt: [('--Near', '-Near'), ('-THE', 'THE')] TMM18981201-V10-12-page44.txt: [('Par-', 'Par')] TMM18981201-V10-12-page45.txt: [('Character-', 'Character')] TMM18981201-V10-12-page46.txt: [('Mission-', 'Mission')] TMM18990101-V11-01-page12.txt: [('-teach', 'teach'), ('-standing', 'standing'), ('-the', 'the')] TMM18990101-V11-01-page13.txt: [('-', '')] TMM18990101-V11-01-page14.txt: [('-', ''), ("'-", "'"), ('oranges-and-', 'oranges-and'), ('-I', 'I'), ('-', ''), ('-of', 'of'), ('-mines', 'mines'), ('-', '')] TMM18990101-V11-01-page17.txt: [('flower-', 'flower')] TMM18990101-V11-01-page19.txt: [('-', '')] TMM18990101-V11-01-page2.txt: [('THI-', 'THI')] TMM18990101-V11-01-page25.txt: [('-', '')] TMM18990101-V11-01-page27.txt: [('-the', 'the'), ('-people', 'people'), ('-their', 'their'), ('-of', 'of')] TMM18990101-V11-01-page28.txt: [('-', '')] TMM18990101-V11-01-page29.txt: [('DAR-ES-', 'DAR-ES'), ('MIS-', 'MIS')] TMM18990101-V11-01-page31.txt: [('the-', 'the')] TMM18990101-V11-01-page32.txt: [('-voyage.', 'voyage.')] TMM18990101-V11-01-page36.txt: [('success-', 'success')] TMM18990101-V11-01-page38.txt: [('whole-', 'whole')] TMM18990101-V11-01-page44.txt: [('stop--', 'stop-')] TMM18990101-V11-01-page45.txt: [('-', '')] TMM18990101-V11-01-page47.txt: [('-', ''), ('PHILADEL-', 'PHILADEL'), ('MISSION-', 'MISSION')] TMM18990101-V11-01-page48.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('.-', '.'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-.', '.'), ('-', ''), ('repre-', 'repre'), ('-.', '.')] TMM18990101-V11-01-page8.txt: [('-rainy', 'rainy')] TMM18990101-V11-01-page9.txt: [('-', ''), ('Spanish-', 'Spanish')] TMM18990201-V11-02-page1.txt: [('Guiana-', 'Guiana')] TMM18990201-V11-02-page13.txt: [('-', '')] TMM18990201-V11-02-page15.txt: [('-', '')] TMM18990201-V11-02-page18.txt: [('un-', 'un')] TMM18990201-V11-02-page23.txt: [('-', '')] TMM18990201-V11-02-page25.txt: [('-', '')] TMM18990201-V11-02-page28.txt: [('-', '')] TMM18990201-V11-02-page29.txt: [('WORK-', 'WORK')] TMM18990201-V11-02-page30.txt: [('-as', 'as'), ('-development', 'development'), ("-of'", "of'"), ('suf-', 'suf')] TMM18990201-V11-02-page38.txt: [('-they', 'they')] TMM18990201-V11-02-page41.txt: [('con-', 'con')] TMM18990201-V11-02-page47.txt: [('-', ''), ('-', ''), ('-', '')] TMM18990201-V11-02-page48.txt: [('inquir-', 'inquir')] TMM18990201-V11-02-page51.txt: [('-', '')] TMM18990201-V11-02-page52.txt: [('MIS-', 'MIS'), ('QUAR-', 'QUAR'), ('-', ''), ('.-', '.'), ('.-', '.'), ('-', ''), ('.-', '.'), ('-', ''), ('.-', '.'), ('.-', '.'), ('.-', '.'), ('-', ''), ('r.-', 'r.'), ('.-', '.')] TMM18990201-V11-02-page53.txt: [('other-', 'other')] TMM18990201-V11-02-page54.txt: [('-', ''), ('PHILADEL-', 'PHILADEL'), ('MISSION-', 'MISSION')] TMM18990201-V11-02-page55.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-.', '.'), ('-', ''), ('-', ''), ("-'", "'"), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-P', 'P')] TMM18990201-V11-02-page9.txt: [('-', '')] TMM18990301-V11-03-page11.txt: [('for-', 'for'), ('-', '')] TMM18990301-V11-03-page13.txt: [('-', ''), ("----'", "---'")] TMM18990301-V11-03-page25.txt: [('-"Christian', '"Christian')] TMM18990301-V11-03-page29.txt: [('HALE.-', 'HALE.')] TMM18990301-V11-03-page30.txt: [('-still', 'still')] TMM18990301-V11-03-page31.txt: [('HOFFMAN-', 'HOFFMAN')] TMM18990301-V11-03-page37.txt: [('-', ''), ('con-', 'con')] TMM18990301-V11-03-page38.txt: [('PHILADEL-', 'PHILADEL'), ('MISSION-', 'MISSION')] TMM18990301-V11-03-page39.txt: [('-', ''), ('-', ''), ('-', ''), ('-.', '.'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-.', '.')] TMM18990301-V11-03-page40.txt: [('-', '')] TMM18990301-V11-03-page7.txt: [('-Burrus', 'Burrus')] TMM18990301-V11-03-page9.txt: [('frame-', 'frame')] TMM18990401-V11-04-page10.txt: [('the-', 'the')] TMM18990401-V11-04-page18.txt: [('-', '')] TMM18990401-V11-04-page23.txt: [('-', '')] TMM18990401-V11-04-page26.txt: [('-and', 'and')] TMM18990401-V11-04-page29.txt: [('-', ''), ('-', ''), ('-', '')] TMM18990401-V11-04-page38.txt: [('Indo-', 'Indo'), ('PHILADEL-', 'PHILADEL'), ('MISSION-', 'MISSION')] TMM18990401-V11-04-page39.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-I-', 'I-'), ('PreNit-', 'PreNit'), ('-', ''), ('repre-', 'repre'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] TMM18990501-V11-05-page16.txt: [('-Syria', 'Syria')] TMM18990501-V11-05-page24.txt: [('of-', 'of')] TMM18990501-V11-05-page31.txt: [('English-', 'English'), ('French-', 'French')] TMM18990501-V11-05-page34.txt: [('-SABBATH', 'SABBATH')] TMM18990501-V11-05-page35.txt: [('-', ''), ('-', '')] TMM18990501-V11-05-page37.txt: [('-', '')] TMM18990501-V11-05-page39.txt: [('-', '')] TMM18990501-V11-05-page41.txt: [('-', '')] TMM18990501-V11-05-page42.txt: [('MIS-', 'MIS'), ('QUAR-', 'QUAR'), ('-', ''), ('.-', '.'), ('-', '')] TMM18990501-V11-05-page43.txt: [('con-', 'con')] TMM18990501-V11-05-page45.txt: [('--IA', '-IA')] TMM18990501-V11-05-page46.txt: [('-', ''), ('time.-', 'time.'), ('PHILADEL-', 'PHILADEL'), ('MISSION-', 'MISSION')] TMM18990501-V11-05-page47.txt: [('-.', '.'), ('-', ''), ('.-', '.'), ('-', ''), ('-', ''), ('repre-', 'repre')] TMM18990601-V11-06-page1.txt: [('-wide', 'wide')] TMM18990601-V11-06-page11.txt: [('-', ''), ('-every', 'every'), ('fever.-', 'fever.'), ('-no', 'no')] TMM18990601-V11-06-page12.txt: [('-lines', 'lines')] TMM18990601-V11-06-page2.txt: [('-is', 'is')] TMM18990601-V11-06-page3.txt: [('-is', 'is')] TMM18990601-V11-06-page30.txt: [('-', '')] TMM18990601-V11-06-page38.txt: [('-', '')] TMM18990601-V11-06-page39.txt: [('-almost', 'almost'), ('-heathen', 'heathen'), ('-uses', 'uses'), ('-the', 'the'), ('-very', 'very'), ('-', '')] TMM18990601-V11-06-page4.txt: [('-we', 'we'), ('-DO', 'DO'), ('work.-', 'work.'), ('-for', 'for')] TMM18990601-V11-06-page46.txt: [('PHILADEL-', 'PHILADEL'), ('MISSION-', 'MISSION'), ('-', '')] TMM18990601-V11-06-page47.txt: [('-', '')] TMM18990601-V11-06-page7.txt: [('things.--', 'things.-')] TMM18990701-V11-07-page11.txt: [('ex-', 'ex'), ('p-', 'p')] TMM18990701-V11-07-page13.txt: [('-the', 'the')] TMM18990701-V11-07-page16.txt: [('the-', 'the')] TMM18990701-V11-07-page17.txt: [('-of', 'of'), ('-these', 'these'), ('-the', 'the'), ('-to', 'to')] TMM18990701-V11-07-page19.txt: [('reading--', 'reading-')] TMM18990701-V11-07-page2.txt: [('-the', 'the'), ('-term', 'term'), ('"When-', '"When'), ('-teacher', 'teacher'), ('-to', 'to')] TMM18990701-V11-07-page20.txt: [('theerec-', 'theerec')] TMM18990701-V11-07-page23.txt: [('-provision', 'provision'), ('reached-', 'reached'), ('Iztaccihuatl-', 'Iztaccihuatl')] TMM18990701-V11-07-page26.txt: [('V-', 'V')] TMM18990701-V11-07-page27.txt: [('so--', 'so-')] TMM18990701-V11-07-page28.txt: [('-because', 'because')] TMM18990701-V11-07-page32.txt: [('.-', '.'), ('-', '')] TMM18990701-V11-07-page36.txt: [('formerly-', 'formerly'), ('receiving-', 'receiving')] TMM18990701-V11-07-page37.txt: [('pray-', 'pray'), ('-era.', 'era.')] TMM18990701-V11-07-page4.txt: [('-recent', 'recent')] TMM18990701-V11-07-page40.txt: [('-', '')] TMM18990701-V11-07-page42.txt: [('-', ''), ('of-', 'of')] TMM18990701-V11-07-page43.txt: [('-fices', 'fices')] TMM18990701-V11-07-page46.txt: [('the-', 'the'), ('MIS-', 'MIS')] TMM18990701-V11-07-page5.txt: [('gov-', 'gov'), ('-to', 'to')] TMM18990701-V11-07-page9.txt: [('Fahren-', 'Fahren')] TMM18990801-V11-08-page11.txt: [('-.', '.'), ('-', ''), ('-', ''), ('-.', '.'), ('-E', 'E'), ('--s', '-s'), ('-"', '"'), ('-I.', 'I.'), ('C".-', 'C".'), ('-t-', 't-'), ('-', ''), ('-C', 'C'), ("-'", "'"), ('-', ''), ('Vcc-', 'Vcc'), ('-', ''), ('-', ''), ('-l', 'l'), ('-c', 'c'), ('-P', 'P'), ('TIc.-', 'TIc.'), ('-Lt.', 'Lt.'), ('a-', 'a'), ('-C', 'C'), ('-.', '.'), ('-c', 'c'), ('-c', 'c'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('m-', 'm'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('--', '-'), ('-', ''), ('-c', 'c'), ('..-', '..'), ('-L', 'L'), ('lec-', 'lec'), ('-', ''), ('.F-', '.F'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-se-tt', 'se-tt'), ('N-', 'N'), ('-', ''), ("it'-", "it'"), ('-V', 'V'), ('-', ''), ('-', ''), ('-', ''), ('iV-', 'iV'), ('-', ''), ('-', ''), ('I.-', 'I.'), ('-', ''), ('-', ''), ('-.', '.'), ("-'", "'"), ('--', '-'), ('-it', 'it'), ('mew.Pgx-', 'mew.Pgx'), ("-T'..", "T'.."), ('-', ''), ('lectlf-', 'lectlf'), ('-.', '.'), ('-', ''), ('-', ''), ('-', ''), ('-.c', '.c'), ('-', ''), ('-.r.', '.r.'), ('-P', 'P'), ('-', ''), ('-', ''), (".''rt.Mgk-", ".''rt.Mgk"), ('-', ''), ('-.', '.'), ('-c', 'c'), ('-', ''), ('-', ''), ('cte-', 'cte'), ("-'", "'"), ('-', ''), ('--', '-'), ('-', ''), ('-', ''), ('-', ''), ('-.', '.'), ('.-', '.'), ('-c', 'c'), ('-', ''), ('Llc-', 'Llc'), ('-', ''), ('-LAE', 'LAE'), ('-', ''), ('-', ''), ('-ore', 'ore')] TMM18990801-V11-08-page18.txt: [('-', '')] TMM18990801-V11-08-page2.txt: [('-', '')] TMM18990801-V11-08-page21.txt: [('-', '')] TMM18990801-V11-08-page31.txt: [('-upon', 'upon'), ('-that', 'that'), ('-.means', '.means')] TMM18990801-V11-08-page32.txt: [('-', ''), ('-', '')] TMM18990801-V11-08-page33.txt: [('-that', 'that'), ('-thousand', 'thousand'), ('-', '')] TMM18990801-V11-08-page36.txt: [('orr-', 'orr')] TMM18990801-V11-08-page37.txt: [('-"like', '"like'), ('-that', 'that'), ('-', '')] TMM18990801-V11-08-page40.txt: [('-that', 'that')] TMM18990801-V11-08-page41.txt: [('-emptied', 'emptied'), ('Him-', 'Him'), ('-self', 'self'), ('-consume', 'consume'), ('-', '')] TMM18990801-V11-08-page43.txt: [('-', '')] TMM18990801-V11-08-page45.txt: [('MIS-', 'MIS'), ('.-', '.'), ('-', ''), ('-', '')] TMM18990801-V11-08-page46.txt: [('-OFFICE', 'OFFICE'), ('-', ''), ('MIS-', 'MIS')] TMM18990801-V11-08-page9.txt: [('-', '')] TMM18990901-V11-09-page1.txt: [('-', '')] TMM18990901-V11-09-page23.txt: [('-', '')] TMM18990901-V11-09-page25.txt: [('-the', 'the'), ('-they', 'they'), ('-teach', 'teach')] TMM18990901-V11-09-page27.txt: [('-to', 'to')] TMM18990901-V11-09-page3.txt: [('-wa-re', 'wa-re'), ('MISSION-', 'MISSION')] TMM18990901-V11-09-page34.txt: [('-', '')] TMM18990901-V11-09-page36.txt: [('.-', '.')] TMM18990901-V11-09-page44.txt: [('-', '')] TMM18990901-V11-09-page46.txt: [('-', ''), ('MIS-', 'MIS')] TMM18990901-V11-09-page47.txt: [('-A--', 'A--'), ('-', ''), ('-al', 'al'), ('-', '')] TMM18991001-V11-10-page1.txt: [('wit-', 'wit')] TMM18991001-V11-10-page10.txt: [('--', '-')] TMM18991001-V11-10-page14.txt: [('-', ''), ('QUEENSLAND.-', 'QUEENSLAND.')] TMM18991001-V11-10-page16.txt: [('-', '')] TMM18991001-V11-10-page2.txt: [('-', '')] TMM18991001-V11-10-page3.txt: [('-that', 'that')] TMM18991001-V11-10-page30.txt: [('-devote', 'devote')] TMM18991001-V11-10-page4.txt: [('-', '')] TMM18991001-V11-10-page44.txt: [('-', ''), ('be-', 'be')] TMM18991001-V11-10-page45.txt: [('-What', 'What')] TMM18991001-V11-10-page46.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('MIS-', 'MIS')] TMM18991001-V11-10-page5.txt: [('Mc-', 'Mc'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] TMM18991101-V11-11-page12.txt: [('-we', 'we')] TMM18991101-V11-11-page23.txt: [('.-', '.'), ("c'te-", "c'te"), ('-', ''), ('i"-', 'i"'), ("-'...", "'..."), ('-', ''), ('F--', 'F-'), ('C-', 'C'), ('-.', '.')] TMM18991101-V11-11-page24.txt: [('-', ''), ('-', ''), ('-a', 'a')] TMM18991101-V11-11-page27.txt: [('-to', 'to')] TMM18991101-V11-11-page32.txt: [('-numbered', 'numbered'), ('-', '')] TMM18991101-V11-11-page33.txt: [('-holy', 'holy')] TMM18991101-V11-11-page37.txt: [('MAGA-', 'MAGA')] TMM18991101-V11-11-page40.txt: [('-', ''), ('---First', '--First')] TMM18991101-V11-11-page42.txt: [('MIS-', 'MIS')] TMM18991101-V11-11-page43.txt: [('sur-', 'sur')] TMM18991101-V11-11-page44.txt: [('-', ''), ('Side-', 'Side')] TMM18991101-V11-11-page46.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('MIS-', 'MIS')] TMM18991101-V11-11-page47.txt: [('r-', 'r'), ('-', ''), ('A-', 'A')] TMM18991201-V11-12-page18.txt: [('.-', '.')] TMM18991201-V11-12-page20.txt: [('-', '')] TMM18991201-V11-12-page29.txt: [('-', '')] TMM18991201-V11-12-page3.txt: [('mist-', 'mist')] TMM18991201-V11-12-page34.txt: [('-than', 'than')] TMM18991201-V11-12-page36.txt: [('-', '')] TMM18991201-V11-12-page37.txt: [('-', '')] TMM18991201-V11-12-page38.txt: [('-', ''), ('-', '')] TMM18991201-V11-12-page39.txt: [('-', ''), ('-', ''), ('-', '')] TMM18991201-V11-12-page40.txt: [('-', ''), ('-', ''), ('-', '')] TMM18991201-V11-12-page41.txt: [('-', ''), ('-', '')] TMM18991201-V11-12-page42.txt: [('-', '')] TMM18991201-V11-12-page45.txt: [('-', '')] TMM18991201-V11-12-page46.txt: [('-', ''), ('MIS-', 'MIS')] TMM18991201-V11-12-page9.txt: [('-', '')] TMM19000101-V12-01-page10.txt: [('-i', 'i'), ('-.', '.'), ('-', ''), ('--', '-'), ('---', '--'), ('------', '-----'), ('A-', 'A'), ('-', ''), ('-', ''), ("-.'", ".'"), ('-', ''), ('-', ''), ('..-', '..'), ('-.Z---....', '.Z---....'), ('-', ''), ("--.'b", "-.'b"), ('-', ''), ('X-', 'X'), ("-l'''", "l'''"), ('-', ''), ('-', ''), ('-.', '.'), ('-', ''), ('-', ''), ('-', ''), ('..-', '..')] TMM19000101-V12-01-page2.txt: [('con-', 'con')] TMM19000101-V12-01-page25.txt: [('self-', 'self')] TMM19000101-V12-01-page3.txt: [('-v-tvot', 'v-tvot')] TMM19000101-V12-01-page30.txt: [('-drew', 'drew'), ('primi-', 'primi')] TMM19000101-V12-01-page33.txt: [('-', '')] TMM19000101-V12-01-page34.txt: [('-', '')] TMM19000101-V12-01-page37.txt: [('MAGA-', 'MAGA')] TMM19000101-V12-01-page38.txt: [('MAG-', 'MAG'), ('-', ''), ('-', '')] TMM19000101-V12-01-page39.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] TMM19000101-V12-01-page4.txt: [('Nice."-', 'Nice."')] TMM19000101-V12-01-page40.txt: [('-', ''), ('-', ''), ('-', ''), ('judg-', 'judg'), ('-', '')] TMM19000101-V12-01-page41.txt: [('-', ''), ('-', ''), ('-', ''), ('-', '')] TMM19000101-V12-01-page42.txt: [('-', ''), ('-', ''), ('-', '')] TMM19000101-V12-01-page43.txt: [('-', ''), ('-', '')] TMM19000101-V12-01-page44.txt: [('MIS-', 'MIS')] TMM19000101-V12-01-page46.txt: [('-BISHOP', 'BISHOP')] TMM19000101-V12-01-page47.txt: [('MAGA-', 'MAGA'), ('MISSION-', 'MISSION')] TMM19000101-V12-01-page48.txt: [('-MISSIONARY', 'MISSIONARY')] TMM19000101-V12-01-page50.txt: [('MAGA-', 'MAGA'), ('MIS-', 'MIS')] TMM19000101-V12-01-page51.txt: [('-.Seventh', '.Seventh'), ('-page', 'page'), ('earn-', 'earn'), ('-', ''), ('PRO-', 'PRO')] TMM19000101-V12-01-page52.txt: [('DEVELOP-', 'DEVELOP')] TMM19000101-V12-01-page6.txt: [('un-', 'un')] TMM19000201-V12-02-page1.txt: [('IN-', 'IN')] TMM19000201-V12-02-page13.txt: [('-', '')] TMM19000201-V12-02-page15.txt: [('weak-', 'weak')] TMM19000201-V12-02-page19.txt: [('-cannot', 'cannot')] TMM19000201-V12-02-page2.txt: [('-sold.', 'sold.')] TMM19000201-V12-02-page3.txt: [('-', '')] TMM19000201-V12-02-page32.txt: [('-', '')] TMM19000201-V12-02-page33.txt: [('-', '')] TMM19000201-V12-02-page34.txt: [('MAGAZINE-', 'MAGAZINE'), ('-', ''), ('-', ''), ('-', '')] TMM19000201-V12-02-page35.txt: [('-', ''), ('-', ''), ('-', '')] TMM19000201-V12-02-page36.txt: [('-', ''), ('-', ''), ('Medo-', 'Medo'), ('corre-', 'corre'), ('-each', 'each'), ('-', ''), ('-', '')] TMM19000201-V12-02-page37.txt: [('-', ''), ('-', ''), ('-', '')] TMM19000201-V12-02-page38.txt: [('-', ''), ('-', '')] TMM19000201-V12-02-page39.txt: [('-', ''), ('-', ''), ('-', ''), ('-MARCH', 'MARCH')] TMM19000201-V12-02-page40.txt: [('-', ''), ('-', '')] TMM19000201-V12-02-page41.txt: [('-', '')] TMM19000201-V12-02-page43.txt: [('mission-', 'mission')] TMM19000201-V12-02-page44.txt: [('-farm', 'farm'), ('-their', 'their')] TMM19000201-V12-02-page46.txt: [('MIS-', 'MIS')] TMM19000201-V12-02-page47.txt: [('in-', 'in')] TMM19000201-V12-02-page49.txt: [('Miss-', 'Miss')] TMM19000201-V12-02-page50.txt: [('-', ''), ('MIS-', 'MIS')] TMM19000201-V12-02-page51.txt: [('ANIMAL."PRO-', 'ANIMAL."PRO'), ('-', ''), ('-text', 'text'), ('-', ''), ('-tiFFI', 'tiFFI'), ('H.-', 'H.'), ('-', ''), ('-', ''), ('-page', 'page'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] TMM19000201-V12-02-page52.txt: [('-', ''), ('al-', 'al')] TMM19000301-V12-03-page10.txt: [('Hongkong----', 'Hongkong---'), ('-', ''), ('-who', 'who')] TMM19000301-V12-03-page11.txt: [('-', '')] TMM19000301-V12-03-page13.txt: [('table-', 'table')] TMM19000301-V12-03-page18.txt: [('-', ''), ('going-', 'going')] TMM19000301-V12-03-page2.txt: [('-', ''), ('-', '')] TMM19000301-V12-03-page26.txt: [('-', '')] TMM19000301-V12-03-page34.txt: [('-', ''), ('na-', 'na')] TMM19000301-V12-03-page35.txt: [('-', '')] TMM19000301-V12-03-page36.txt: [('-', ''), ('-', '')] TMM19000301-V12-03-page39.txt: [('-', '')] TMM19000301-V12-03-page41.txt: [('-', ''), ('-', ''), ('-', '')] TMM19000301-V12-03-page42.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] TMM19000301-V12-03-page43.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] TMM19000301-V12-03-page44.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] TMM19000301-V12-03-page45.txt: [('Superin-', 'Superin'), ('-', ''), ('thy-', 'thy'), ('--Ider', '-Ider'), ('-school', 'school'), ('-turn', 'turn')] TMM19000301-V12-03-page47.txt: [('-a', 'a'), ('-a', 'a'), ('C-', 'C')] TMM19000301-V12-03-page48.txt: [('-', ''), ('-ton', 'ton')] TMM19000301-V12-03-page5.txt: [('expres-', 'expres'), ('-They', 'They')] TMM19000301-V12-03-page8.txt: [('Wall-', 'Wall'), ('-', '')] TMM19000301-V12-03-page9.txt: [('M-', 'M'), ('-c.', 'c.'), ('X-', 'X'), ('---', '--')] TMM19000401-V12-04-page1.txt: [('-', '')] TMM19000401-V12-04-page14.txt: [('-', ''), ('-', ''), ('-', '')] TMM19000401-V12-04-page15.txt: [('-', '')] TMM19000401-V12-04-page2.txt: [('con-', 'con')] TMM19000401-V12-04-page33.txt: [('-future', 'future')] TMM19000401-V12-04-page39.txt: [('-', '')] TMM19000401-V12-04-page40.txt: [('-utmost', 'utmost')] TMM19000401-V12-04-page43.txt: [('-', '')] TMM19000401-V12-04-page44.txt: [('-', ''), ('-', ''), ('-', '')] TMM19000401-V12-04-page45.txt: [('-', ''), ('-', ''), ('-', '')] TMM19000401-V12-04-page46.txt: [('-', ''), ('-', ''), ('-', ''), ('expedi-', 'expedi')] TMM19000401-V12-04-page47.txt: [('-', ''), ('-', '')] TMM19000401-V12-04-page48.txt: [('-public', 'public')] TMM19000401-V12-04-page50.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('MIS-', 'MIS')] TMM19000401-V12-04-page51.txt: [('-r', 'r'), ('-need-to', 'need-to'), ('Sub-', 'Sub'), ('C-', 'C'), ('Postpaid.-', 'Postpaid.'), ('-', '')] TMM19000401-V12-04-page52.txt: [('-York.', 'York.')] TMM19000401-V12-04-page7.txt: [('-', ''), ('-', '')] TMM19000401-V12-04-page8.txt: [('con-', 'con')] TMM19000501-V12-05-page10.txt: [('Saint-', 'Saint')] TMM19000501-V12-05-page11.txt: [('-', '')] TMM19000501-V12-05-page12.txt: [('cere-', 'cere')] TMM19000501-V12-05-page14.txt: [('examina-', 'examina')] TMM19000501-V12-05-page15.txt: [('-', '')] TMM19000501-V12-05-page22.txt: [('-', '')] TMM19000501-V12-05-page25.txt: [('-', ''), ('---', '--'), ('-------', '------'), ('--.', '-.'), ('--', '-'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('----', '---'), ('--.', '-.'), ('-', ''), ('-.-.', '.-.'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('.----', '.---'), ('---', '--'), ('------.', '-----.'), ('-', ''), ('t.---', 't.--'), ('-', ''), ('-', ''), ('----.-', '---.-'), ('-', '')] TMM19000501-V12-05-page26.txt: [('third-', 'third')] TMM19000501-V12-05-page29.txt: [('restric-', 'restric')] TMM19000501-V12-05-page3.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] TMM19000501-V12-05-page32.txt: [('MISSION-', 'MISSION')] TMM19000501-V12-05-page37.txt: [('re-', 're')] TMM19000501-V12-05-page39.txt: [('-MAY', 'MAY'), ('-', ''), ('-', ''), ('-', '')] TMM19000501-V12-05-page40.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] TMM19000501-V12-05-page41.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] TMM19000501-V12-05-page42.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-JUNE', 'JUNE'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] TMM19000501-V12-05-page43.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] TMM19000501-V12-05-page44.txt: [('-', '')] TMM19000501-V12-05-page45.txt: [('MIS-', 'MIS')] TMM19000501-V12-05-page5.txt: [('sol-', 'sol'), ('second-', 'second')] TMM19000501-V12-05-page50.txt: [('-', ''), ('-', ''), ('-', ''), ('MIS-', 'MIS')] TMM19000501-V12-05-page51.txt: [('A-', 'A'), ('God.-', 'God.'), ('be-', 'be'), ('be-', 'be'), ('-', ''), ('A-', 'A'), ('-', ''), ("'si-", "'si"), ('-', ''), ('A-', 'A'), ('-', ''), ('-', '')] TMM19000501-V12-05-page6.txt: [('idola-', 'idola')] TMM19000601-V12-06-page12.txt: [('-', '')] TMM19000601-V12-06-page13.txt: [('-the', 'the')] TMM19000601-V12-06-page19.txt: [('ap-', 'ap')] TMM19000601-V12-06-page23.txt: [('surround-', 'surround')] TMM19000601-V12-06-page27.txt: [('-', '')] TMM19000601-V12-06-page28.txt: [('con-', 'con')] TMM19000601-V12-06-page29.txt: [('amplifi-', 'amplifi'), ('-', '')] TMM19000601-V12-06-page33.txt: [('the-', 'the')] TMM19000601-V12-06-page34.txt: [('-', '')] TMM19000601-V12-06-page37.txt: [('-', ''), ('devasta-', 'devasta')] TMM19000601-V12-06-page38.txt: [('-acre', 'acre')] TMM19000601-V12-06-page39.txt: [('-foot', 'foot')] TMM19000601-V12-06-page44.txt: [('com-', 'com'), ('-', ''), ('-', '')] TMM19000601-V12-06-page45.txt: [('-', ''), ('-', '')] TMM19000601-V12-06-page48.txt: [('-', '')] TMM19000601-V12-06-page49.txt: [('cur-', 'cur')] TMM19000601-V12-06-page5.txt: [('prepara-', 'prepara')] TMM19000601-V12-06-page50.txt: [('MIS-', 'MIS')] TMM19000601-V12-06-page51.txt: [('A-', 'A'), ('A-', 'A'), ('be-', 'be'), ('-', '')] TMM19000601-V12-06-page52.txt: [('-', '')] TMM19000701-V12-07-page11.txt: [('reveren-', 'reveren')] TMM19000701-V12-07-page13.txt: [('mem-', 'mem')] TMM19000701-V12-07-page14.txt: [('-', '')] TMM19000701-V12-07-page26.txt: [('-', ''), ('-', ''), ('-', '')] TMM19000701-V12-07-page32.txt: [('con-', 'con')] TMM19000701-V12-07-page34.txt: [('-', '')] TMM19000701-V12-07-page36.txt: [('-', '')] TMM19000701-V12-07-page44.txt: [('READING-', 'READING')] TMM19000701-V12-07-page46.txt: [('-', ''), ('-', '')] TMM19000701-V12-07-page47.txt: [('-', ''), ('-', '')] TMM19000701-V12-07-page48.txt: [('-', '')] TMM19000701-V12-07-page49.txt: [('-', '')] TMM19000701-V12-07-page5.txt: [('-though', 'though')] TMM19000701-V12-07-page50.txt: [('-', ''), ('-', ''), ('-', ''), ('MIS-', 'MIS')] TMM19000701-V12-07-page51.txt: [('A-', 'A'), ('be-', 'be'), ('A-', 'A')] TMM19000701-V12-07-page6.txt: [('congrega-', 'congrega')] TMM19000701-V12-07-page9.txt: [('promul-', 'promul'), ('sub-', 'sub')] TMM19000801-V12-08-page1.txt: [('-Vol.', 'Vol.')] TMM19000801-V12-08-page10.txt: [('funda-', 'funda')] TMM19000801-V12-08-page17.txt: [('-', ''), ('-', ''), ('-', '')] TMM19000801-V12-08-page19.txt: [('road-', 'road'), ('be-', 'be')] TMM19000801-V12-08-page2.txt: [('con-', 'con')] TMM19000801-V12-08-page20.txt: [('-', ''), ('-', ''), ('-', '')] TMM19000801-V12-08-page21.txt: [('appar-', 'appar')] TMM19000801-V12-08-page23.txt: [('-', '')] TMM19000801-V12-08-page26.txt: [('Astra-', 'Astra')] TMM19000801-V12-08-page27.txt: [('Tscher-', 'Tscher')] TMM19000801-V12-08-page28.txt: [('shin-', 'shin')] TMM19000801-V12-08-page3.txt: [('-', '')] TMM19000801-V12-08-page31.txt: [('--', '-')] TMM19000801-V12-08-page33.txt: [('-', '')] TMM19000801-V12-08-page38.txt: [('-', '')] TMM19000801-V12-08-page39.txt: [('-', ''), ('-', '')] TMM19000801-V12-08-page40.txt: [('v-', 'v')] TMM19000801-V12-08-page44.txt: [('MIS-', 'MIS'), ('.-', '.'), ('.-', '.'), ('.-', '.'), ('-', ''), ('.-', '.'), ('.-', '.'), ('.-', '.'), ('.-', '.'), ('.-', '.'), ('.-', '.'), ('.-', '.'), ('.-', '.'), ('.-', '.')] TMM19000801-V12-08-page45.txt: [('-', ''), ('.-', '.')] TMM19000801-V12-08-page46.txt: [('-THE', 'THE')] TMM19000801-V12-08-page47.txt: [('-', ''), ('-', '')] TMM19000801-V12-08-page48.txt: [('pos-', 'pos')] TMM19000801-V12-08-page49.txt: [('GA-', 'GA')] TMM19000801-V12-08-page5.txt: [('View-', 'View'), ('Gos-', 'Gos')] TMM19000801-V12-08-page50.txt: [('MtssioN-', 'MtssioN'), ('-', ''), ('-', ''), ('-', ''), ('MIS-', 'MIS')] TMM19000801-V12-08-page7.txt: [('presi-', 'presi')] TMM19000901-V12-09-page11.txt: [('-', '')] TMM19000901-V12-09-page16.txt: [('an-', 'an'), ('cere-', 'cere'), ('con-', 'con')] TMM19000901-V12-09-page24.txt: [('-to', 'to')] TMM19000901-V12-09-page25.txt: [('-', ''), ('man-', 'man')] TMM19000901-V12-09-page28.txt: [('di-', 'di')] TMM19000901-V12-09-page29.txt: [('-return', 'return'), ('--', '-'), ('rep-', 'rep')] TMM19000901-V12-09-page32.txt: [('-', '')] TMM19000901-V12-09-page33.txt: [('-one', 'one'), ('adelan-', 'adelan')] TMM19000901-V12-09-page42.txt: [('-', ''), ('-', ''), ('-', ''), ('-', '')] TMM19000901-V12-09-page43.txt: [('-', '')] TMM19000901-V12-09-page44.txt: [('-SEPTEMBER', 'SEPTEMBER'), ('-', ''), ('-', '')] TMM19000901-V12-09-page45.txt: [('-', ''), ('-', '')] TMM19000901-V12-09-page46.txt: [('-twice', 'twice')] TMM19000901-V12-09-page47.txt: [('Ho-', 'Ho')] TMM19000901-V12-09-page48.txt: [('-and', 'and'), ('-charge', 'charge'), ('-connection', 'connection')] TMM19000901-V12-09-page49.txt: [('-', ''), ('-', '')] TMM19000901-V12-09-page50.txt: [('MIS-', 'MIS'), ('.-', '.')] TMM19000901-V12-09-page51.txt: [('con-', 'con'), ('-', '')] TMM19000901-V12-09-page52.txt: [('Blau-', 'Blau')] TMM19001001-V12-10-page1.txt: [('-i', 'i'), ('--', '-'), ('-', ''), ('-', ''), ('-', ''), ('-.', '.'), ('-', ''), ('-', '')] TMM19001001-V12-10-page14.txt: [('chil-', 'chil')] TMM19001001-V12-10-page15.txt: [('--that', '-that')] TMM19001001-V12-10-page18.txt: [('-', '')] TMM19001001-V12-10-page2.txt: [('-', ''), ('-Australia', 'Australia')] TMM19001001-V12-10-page23.txt: [('Pi-', 'Pi')] TMM19001001-V12-10-page29.txt: [('-', ''), ('hold-', 'hold')] TMM19001001-V12-10-page30.txt: [('scamper-', 'scamper')] TMM19001001-V12-10-page44.txt: [('E-', 'E'), ('-', '')] TMM19001001-V12-10-page45.txt: [('-', '')] TMM19001001-V12-10-page46.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('---', '--'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] TMM19001001-V12-10-page47.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] TMM19001001-V12-10-page49.txt: [('-paid', 'paid')] TMM19001001-V12-10-page50.txt: [('--August', '-August'), ('-', ''), ('-', ''), ('MIS-', 'MIS')] TMM19001001-V12-10-page51.txt: [('-', '')] TMM19001001-V12-10-page52.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-BEARING', 'BEARING'), ('utensil-', 'utensil')] TMM19001001-V12-10-page6.txt: [('fellowship-', 'fellowship'), ('influ-', 'influ')] TMM19001001-V12-10-page8.txt: [('MISSION-', 'MISSION'), ('MAG-', 'MAG'), ('condi-', 'condi')] TMM19001101-V12-11-page1.txt: [('-', '')] TMM19001101-V12-11-page13.txt: [('believ-', 'believ')] TMM19001101-V12-11-page15.txt: [('cab-', 'cab')] TMM19001101-V12-11-page18.txt: [('igno-', 'igno')] TMM19001101-V12-11-page19.txt: [('se-', 'se'), ('rever-', 'rever')] TMM19001101-V12-11-page20.txt: [('French-', 'French')] TMM19001101-V12-11-page22.txt: [('-United', 'United')] TMM19001101-V12-11-page24.txt: [('produc-', 'produc'), ('-', ''), ('ap-', 'ap')] TMM19001101-V12-11-page27.txt: [('-', ''), ('-', '')] TMM19001101-V12-11-page31.txt: [('-the', 'the')] TMM19001101-V12-11-page32.txt: [('-', '')] TMM19001101-V12-11-page34.txt: [('-', '')] TMM19001101-V12-11-page38.txt: [('-NovEmnEn', 'NovEmnEn'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] TMM19001101-V12-11-page39.txt: [('-', ''), ('-Our', 'Our'), ('-', '')] TMM19001101-V12-11-page40.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] TMM19001101-V12-11-page41.txt: [('-That', 'That'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] TMM19001101-V12-11-page42.txt: [('-DECEMBER', 'DECEMBER'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] TMM19001101-V12-11-page43.txt: [('-', '')] TMM19001101-V12-11-page44.txt: [('-', ''), ('-', ''), ('MIS-', 'MIS'), ('.-', '.'), ('.-', '.'), ('.-', '.'), ('.-', '.'), ('.-', '.')] TMM19001101-V12-11-page45.txt: [('.-', '.'), ('.-', '.'), ('.-', '.'), ('.-', '.'), ('.-', '.')] TMM19001101-V12-11-page46.txt: [('-', ''), ('-', ''), ('.-', '.')] TMM19001101-V12-11-page48.txt: [('-THE', 'THE')] TMM19001101-V12-11-page49.txt: [('Sabbath-', 'Sabbath')] TMM19001101-V12-11-page50.txt: [('MIS-', 'MIS')] TMM19001101-V12-11-page51.txt: [('-', ''), ('makeCon-', 'makeCon'), ('-cured.', 'cured.'), ('.successful-', '.successful'), ('-', ''), ('-', ''), ('-name', 'name')] TMM19001101-V12-11-page52.txt: [('-ought', 'ought'), ('-BEARING', 'BEARING'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ("-'not", "'not"), ('-With', 'With'), ('--', '-')] TMM19001101-V12-11-page8.txt: [('-', '')] TMM19001201-V12-12-page10.txt: [('ba-', 'ba')] TMM19001201-V12-12-page14.txt: [('-', ''), ('-lb-', 'lb-'), ('-', ''), ('-', ''), ('-', '')] TMM19001201-V12-12-page19.txt: [('-', '')] TMM19001201-V12-12-page2.txt: [('-Oakland', 'Oakland')] TMM19001201-V12-12-page23.txt: [('-', '')] TMM19001201-V12-12-page3.txt: [('Spirit-', 'Spirit')] TMM19001201-V12-12-page30.txt: [('-', ''), ('-', '')] TMM19001201-V12-12-page31.txt: [('lo-', 'lo')] TMM19001201-V12-12-page34.txt: [('blast--', 'blast-')] TMM19001201-V12-12-page35.txt: [('-', ''), ('-', '')] TMM19001201-V12-12-page37.txt: [('under-', 'under')] TMM19001201-V12-12-page39.txt: [('-the', 'the')] TMM19001201-V12-12-page42.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] TMM19001201-V12-12-page43.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] TMM19001201-V12-12-page44.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] TMM19001201-V12-12-page45.txt: [('-', ''), ('-', '')] TMM19001201-V12-12-page47.txt: [('-', ''), ('-', ''), ('-', ''), ('-', '')] TMM19001201-V12-12-page48.txt: [('-Oun', 'Oun')] TMM19001201-V12-12-page49.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] TMM19001201-V12-12-page50.txt: [('-', ''), ('-', ''), ('MIS-', 'MIS')] TMM19001201-V12-12-page51.txt: [('-', '')] TMM19001201-V12-12-page6.txt: [('an-', 'an'), ('Ad-', 'Ad'), ('-', '')] TMM19001201-V12-12-page7.txt: [('advance-', 'advance')] TMM19020101-V14-01-page1.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('--', '-'), ('-.New', '.New'), ('-', ''), ('-', ''), ('-The', 'The'), ('-', ''), ('-', ''), ('Alaska-', 'Alaska'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-.Mayaguez', '.Mayaguez'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('U-', 'U')] TMM19020101-V14-01-page12.txt: [('Natal-', 'Natal')] TMM19020101-V14-01-page13.txt: [('salva-', 'salva')] TMM19020101-V14-01-page15.txt: [('re-', 're')] TMM19020101-V14-01-page16.txt: [('-', ''), ('-', '')] TMM19020101-V14-01-page17.txt: [('resur-', 'resur')] TMM19020101-V14-01-page18.txt: [('-', ''), ('-', ''), ('-', ''), ('-', '')] TMM19020101-V14-01-page2.txt: [('LIFT-', 'LIFT'), ('-PRICE', 'PRICE'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('l-', 'l')] TMM19020101-V14-01-page23.txt: [('impos-', 'impos')] TMM19020101-V14-01-page27.txt: [('dis-', 'dis')] TMM19020101-V14-01-page29.txt: [('-', '')] TMM19020101-V14-01-page31.txt: [('-', ''), ('Pe-', 'Pe')] TMM19020101-V14-01-page38.txt: [('heav-', 'heav')] TMM19020101-V14-01-page49.txt: [('nec-', 'nec')] TMM19020101-V14-01-page50.txt: [('-', ''), ('SECOND-', 'SECOND'), ('MAG-', 'MAG')] TMM19020101-V14-01-page7.txt: [('grow-', 'grow')] TMM19020101-V14-01-page8.txt: [('-', ''), ('-page', 'page')] TMM19020201-V14-02-page1.txt: [('-ii', 'ii'), ('-', '')] TMM19020201-V14-02-page12.txt: [('-', ''), ('Cama-', 'Cama')] TMM19020201-V14-02-page14.txt: [('jus-', 'jus')] TMM19020201-V14-02-page15.txt: [('-', '')] TMM19020201-V14-02-page2.txt: [('-', ''), ('-', ''), ('-', '')] TMM19020201-V14-02-page21.txt: [('MISSION-', 'MISSION')] TMM19020201-V14-02-page25.txt: [('-eyes', 'eyes'), ('Ital-', 'Ital')] TMM19020201-V14-02-page33.txt: [('neces-', 'neces')] TMM19020201-V14-02-page35.txt: [('-', '')] TMM19020201-V14-02-page38.txt: [('Erken-', 'Erken')] TMM19020201-V14-02-page45.txt: [('-', '')] TMM19020201-V14-02-page46.txt: [('Okla-', 'Okla')] TMM19020201-V14-02-page47.txt: [('Fund.-', 'Fund.'), ('-', ''), ('Relief.-', 'Relief.'), ('Tithe.-', 'Tithe.'), ('Donations.-', 'Donations.'), ('Mission.-', 'Mission.'), ('Mission.-', 'Mission.'), ('Mission.-', 'Mission.'), ('Mission.-', 'Mission.'), ('Mission.-', 'Mission.'), ('Mission.-', 'Mission.'), ('Mission.-', 'Mission.'), ('Mission.-', 'Mission.'), ('Mission.-', 'Mission.'), ('School.-', 'School.'), ('Mission.-', 'Mission.'), ('Mission.-', 'Mission.'), ('Field.-', 'Field.'), ('Africa.-', 'Africa.'), ('Mission.-', 'Mission.')] TMM19020201-V14-02-page49.txt: [('-', ''), ('ARIZONA.-', 'ARIZONA.'), ('CALIFORNIA.-', 'CALIFORNIA.'), ('-', ''), ('CUMBER-', 'CUMBER'), ('FLORIDA.-', 'FLORIDA.'), ('GEORGIA.-', 'GEORGIA.'), ('ILLINOIS.-', 'ILLINOIS.'), ('TERRITORY.-', 'TERRITORY.'), ('KANSAS.-', 'KANSAS.'), ('LOUISI-', 'LOUISI'), ('ANA.-', 'ANA.'), ('MIcHIGAN.-', 'MIcHIGAN.'), ('MINNESOTA.-', 'MINNESOTA.'), ('MISSOURI.-', 'MISSOURI.'), ('NE-', 'NE'), ('BRASKA.-', 'BRASKA.'), ('YORK.-', 'YORK.'), ('CAROLINA.-', 'CAROLINA.'), ('Oxio.-', 'Oxio.'), ('TERRITORY.-', 'TERRITORY.'), ('OREGON.-', 'OREGON.'), ('PENNSYLVANIA.-', 'PENNSYLVANIA.'), ('DAKOTA.-', 'DAKOTA.'), ('TEXAS.-', 'TEXAS.'), ('VERMONT.-', 'VERMONT.'), ('-', ''), ('VIRGINIA.-', 'VIRGINIA.'), ('g.-', 'g.')] TMM19020201-V14-02-page5.txt: [('-', '')] TMM19020201-V14-02-page50.txt: [('SECOND-', 'SECOND'), ('Expirations.-', 'Expirations.'), ('MAG-', 'MAG'), ('TEUTONIC-', 'TEUTONIC'), ('CELTIC-', 'CELTIC'), ('-', '')] TMM19020201-V14-02-page51.txt: [('-', '')] TMM19020201-V14-02-page52.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('RAIL-', 'RAIL')] TMM19020201-V14-02-page7.txt: [('Holland-', 'Holland')] TMM19020201-V14-02-page8.txt: [('-', ''), ('Advent-', 'Advent')] TMM19020301-V14-03-page1.txt: [('--', '-'), ('-', ''), ('-The', 'The'), ('Hungary-', 'Hungary'), ('-', ''), ('-In', 'In'), ('L-', 'L'), ('-', ''), ('--', '-'), ('-', ''), ('.-', '.'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('Tuna-', 'Tuna'), ('-', ''), ("'Children-", "'Children")] TMM19020301-V14-03-page10.txt: [('country-', 'country'), ('prin-', 'prin')] TMM19020301-V14-03-page12.txt: [('com-', 'com')] TMM19020301-V14-03-page14.txt: [('meet-', 'meet'), ('de-', 'de')] TMM19020301-V14-03-page17.txt: [('Mon-', 'Mon')] TMM19020301-V14-03-page2.txt: [('-', ''), ('-Apply', 'Apply'), ('WathiOR-', 'WathiOR'), ('-', ''), ('-', ''), ('.-', '.')] TMM19020301-V14-03-page21.txt: [('increas-', 'increas')] TMM19020301-V14-03-page22.txt: [('-', '')] TMM19020301-V14-03-page26.txt: [('-', ''), ('appear-', 'appear'), ('-', ''), ('-miles', 'miles'), ('-', ''), ('Ar-', 'Ar')] TMM19020301-V14-03-page27.txt: [('mem-', 'mem')] TMM19020301-V14-03-page3.txt: [('RAIL-', 'RAIL'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] TMM19020301-V14-03-page31.txt: [('-', '')] TMM19020301-V14-03-page35.txt: [('propor-', 'propor')] TMM19020301-V14-03-page39.txt: [('IN-', 'IN')] TMM19020301-V14-03-page40.txt: [('-A', 'A')] TMM19020301-V14-03-page43.txt: [('-a', 'a')] TMM19020301-V14-03-page45.txt: [('-gain', 'gain'), ('dark-', 'dark')] TMM19020301-V14-03-page47.txt: [('-pressed', 'pressed')] TMM19020301-V14-03-page49.txt: [('-', '')] TMM19020301-V14-03-page5.txt: [('A-', 'A')] TMM19020301-V14-03-page50.txt: [('-', ''), ('-', ''), ('-', ''), ('-', '')] TMM19020301-V14-03-page51.txt: [('Foun-', 'Foun'), ('-', ''), ('-inch', 'inch'), ('-inch', 'inch'), ('-in.', 'in.'), ('-in.', 'in.'), ('-', '')] TMM19020301-V14-03-page52.txt: [('-', ''), ('-', '')] TMM19020301-V14-03-page7.txt: [('---', '--'), ('-.', '.'), ('-i', 'i'), ('-.', '.'), ('-----', '----'), ('-', ''), ('.-', '.'), ('----', '---'), ('-', ''), ("--'---", "-'---"), ('-', ''), ('-', ''), ('-', ''), ('f--', 'f-')] TMM19020301-V14-03-page8.txt: [('-', '')] TMM19020301-V14-03-page9.txt: [('--or', '-or')] TMM19020401-V14-04-page1.txt: [('-.', '.'), ('.CONTENTSib-', '.CONTENTSib'), ('-', ''), ('-', '')] TMM19020401-V14-04-page11.txt: [('-strong', 'strong')] TMM19020401-V14-04-page13.txt: [('-', '')] TMM19020401-V14-04-page16.txt: [('p-a--', 'p-a-'), ('-arr', 'arr'), ('-', ''), ('-e', 'e'), ('-', ''), ('-', ''), ("--C'rA", "-C'rA"), ('-X', 'X')] TMM19020401-V14-04-page2.txt: [('-', '')] TMM19020401-V14-04-page24.txt: [('re-', 're')] TMM19020401-V14-04-page3.txt: [('RAIL-', 'RAIL'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')] TMM19020401-V14-04-page34.txt: [('-', ''), ('expe-', 'expe')] TMM19020401-V14-04-page43.txt: [('institu-', 'institu')] TMM19020401-V14-04-page46.txt: [('Brother-', 'Brother')] TMM19020401-V14-04-page49.txt: [('accord-', 'accord')] TMM19020401-V14-04-page50.txt: [('SECOND-', 'SECOND'), ('-', '')] TMM19020401-V14-04-page52.txt: [('-.', '.')] TMM19020401-V14-04-page7.txt: [('At-', 'At'), ('-rtitxm', 'rtitxm'), ('kk-t-', 'kk-t'), ('-', ''), ('.z-', '.z'), ('wt-', 'wt'), ('m-', 'm'), ('-', ''), ('t-', 't')] TMM19020401-V14-04-page8.txt: [('--', '-')] TMM19020501-V14-05-page1.txt: [('.-', '.'), ('-Jamaica', 'Jamaica'), ('-', ''), ('-', ''), ('-.', '.')] TMM19020501-V14-05-page10.txt: [('sta-', 'sta')] TMM19020501-V14-05-page11.txt: [('-', ''), ('suc-', 'suc')] TMM19020501-V14-05-page13.txt: [('moun-', 'moun')] TMM19020501-V14-05-page16.txt: [('-in', 'in')] TMM19020501-V14-05-page17.txt: [('mis-', 'mis')] TMM19020501-V14-05-page2.txt: [('-', ''), ('AS--', 'AS-'), ('-OUR', 'OUR'), ('-gives', 'gives')] TMM19020501-V14-05-page24.txt: [('-first-day', 'first-day'), ('-', '')] TMM19020501-V14-05-page25.txt: [('-', ''), ('-', '')] TMM19020501-V14-05-page27.txt: [('out-', 'out')] TMM19020501-V14-05-page3.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('RAIL-', 'RAIL')] TMM19020501-V14-05-page31.txt: [('in-', 'in')] TMM19020501-V14-05-page35.txt: [('Method-', 'Method')] TMM19020501-V14-05-page36.txt: [('-', '')] TMM19020501-V14-05-page4.txt: [('-', '')] TMM19020501-V14-05-page40.txt: [('experi-', 'experi')] TMM19020501-V14-05-page42.txt: [('-us', 'us')] TMM19020501-V14-05-page43.txt: [('LATER.-', 'LATER.')] TMM19020501-V14-05-page45.txt: [('ex-', 'ex')] TMM19020501-V14-05-page46.txt: [('-', ''), ('Con-', 'Con')] TMM19020501-V14-05-page47.txt: [('-', ''), ('Relief.-', 'Relief.'), ('Sanatorium.-', 'Sanatorium.'), ('-', ''), ('Tithe.-', 'Tithe.'), ('Conference.-', 'Conference.'), ('Mission.-', 'Mission.'), ('Conference.-', 'Conference.'), ('-', ''), ('-', ''), ('.-', '.'), ('-', ''), ('Mission.-', 'Mission.'), ('Italy.-', 'Italy.'), ('-', ''), ('-', ''), ('Mission.-', 'Mission.'), ('-', ''), ('Mission.-', 'Mission.'), ('-', ''), ('-California', 'California'), ('-', ''), ('Conference.-', 'Conference.'), ('Conference.-', 'Conference.'), ('-', ''), ('-Iowa', 'Iowa'), ('-', ''), ('-Minnesota', 'Minnesota')] TMM19020501-V14-05-page48.txt: [('-', ''), ('-', ''), ('-', ''), ('Fund.-', 'Fund.'), ('-', ''), ('-', ''), ('BENEV-', 'BENEV'), ('-', '')] TMM19020501-V14-05-page5.txt: [('I-', 'I')] TMM19020501-V14-05-page50.txt: [('-', ''), ('-', ''), ('-', ''), ('-', '')] TMM19020501-V14-05-page51.txt: [('or.-', 'or.'), ('-wl', 'wl'), ('--', '-')] TMM19020501-V14-05-page52.txt: [('-', ''), ('-', '')] TMM19020501-V14-05-page6.txt: [('key-', 'key')] TMM19020501-V14-05-page7.txt: [('MAGA-', 'MAGA')] TMM19020501-V14-05-page8.txt: [('pro-', 'pro')]
In [30]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/TMM/correction3 Average verified rate: 0.9839672985814993 Average of error rates: 0.023232207792207794 Total token count: 861614
In [31]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[31]:
[("'", 583), ('e', 484), ('w', 476), ('m', 341), ('t', 326), ('r', 309), ('d', 302), ('n', 298), ('f', 269), ('g', 250), ('th', 109), ('x', 75), ('co', 70), ('k', 66), ('pa', 64), ('u', 64), ('z', 61), ('mis', 42), ('io', 42), ('oc', 40), ('oo', 33), ('cc', 29), ('sionary', 29), ('re', 25), ('al', 23), ("'the", 23), ('q', 22), ('mt', 20), ('hausaland', 19), ('id', 19), ("''", 19), ('stauffer', 19), ('ary', 19), ('basle', 18), ('zo', 18), ('ft', 18), ('mo', 18), ('couva', 17), ('kalaka', 17), ('hasegawa', 17), ('sul', 17), ('okohira', 16), ('ro', 16), ('pp', 15), ('helsingfors', 15), ('sabbathschool', 15), ("hours'", 15), ('te', 15), ('schwantes', 15), ('raiatea', 15)]
Correction 4 -- Remove Extra Quotation Marks¶
In [33]:
# %load shared_elements/remove_extra_quotation_marks.py
prev = cycle
cycle = "correction4"
directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
os.makedirs(directories['cycle'])
corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))
for filename in corpus:
content = utilities.readfile(directories['prev'], filename)
text = re.sub(r"[0-9,!?$:;&]", " ", content)
tokens = utilities.tokenize_text(text)
corrections = []
for token in tokens:
token_list = list(token)
last_char = token_list[-1]
if last_char is "'":
if len(token) > 1:
if token_list[-2] is 's' or 'S':
pass
else:
corrections.append((token, re.sub(r"'", r"", token)))
else:
pass
elif token[0] is "'":
corrections.append((token, re.sub(r"'", r"", token)))
else:
pass
if len(corrections) > 0:
print('{}: {}'.format(filename, corrections))
for correction in corrections:
content = clean.replace_pair(correction, content)
else:
pass
with open(join(directories['cycle'], filename), mode="w") as o:
o.write(content)
o.close()
TMM18980101-V10-01-page11.txt: [("'tis", 'tis')] TMM18980101-V10-01-page12.txt: [("'his", 'his')] TMM18980101-V10-01-page13.txt: [("'nough", 'nough')] TMM18980101-V10-01-page16.txt: [("'The", 'The')] TMM18980101-V10-01-page3.txt: [("'Redeemer", 'Redeemer')] TMM18980101-V10-01-page4.txt: [("'YWVP", 'YWVP')] TMM18980101-V10-01-page5.txt: [("'the", 'the'), ("'which", 'which'), ("'sphere", 'sphere')] TMM18980201-V10-02-page14.txt: [("'my", 'my')] TMM18980201-V10-02-page34.txt: [("'Contemplated", 'Contemplated')] TMM18980301-V10-03-page6.txt: [("'t", 't')] TMM18980401-V10-04-page12.txt: [("'put", 'put')] TMM18980401-V10-04-page13.txt: [("'a", 'a')] TMM18980401-V10-04-page17.txt: [("'The", 'The')] TMM18980401-V10-04-page22.txt: [("'one", 'one'), ("'of", 'of')] TMM18980401-V10-04-page31.txt: [("'s", 's'), ("'s", 's'), ("'s", 's'), ("'out", 'out')] TMM18980401-V10-04-page4.txt: [("'S", 'S')] TMM18980501-V10-05-page14.txt: [("'appreciate", 'appreciate')] TMM18980501-V10-05-page28.txt: [("'g", 'g'), ("'f", 'f')] TMM18980501-V10-05-page31.txt: [("'HYATT.", 'HYATT.')] TMM18980501-V10-05-page40.txt: [("'God", 'God')] TMM18980501-V10-05-page7.txt: [("'tis", 'tis')] TMM18980601-V10-06-page20.txt: [("'.", '.'), ("'I", 'I'), ("'t", 't'), ("'..", '..')] TMM18980601-V10-06-page25.txt: [("'Ye", 'Ye')] TMM18980601-V10-06-page26.txt: [("'Here", 'Here')] TMM18980601-V10-06-page32.txt: [("'countries.", 'countries.')] TMM18980601-V10-06-page36.txt: [("'business", 'business')] TMM18980601-V10-06-page5.txt: [("'forward", 'forward')] TMM18980701-V10-07-page23.txt: [("'some", 'some')] TMM18980701-V10-07-page25.txt: [("'the", 'the')] TMM18980701-V10-07-page38.txt: [("'.", '.')] TMM18980701-V10-07-page39.txt: [("'Signs", 'Signs')] TMM18980701-V10-07-page40.txt: [("'Signs", 'Signs')] TMM18980701-V10-07-page7.txt: [("'AlI", 'AlI')] TMM18980801-V10-08-page11.txt: [("'them", 'them')] TMM18980801-V10-08-page30.txt: [("'Come", 'Come')] TMM18980801-V10-08-page31.txt: [("'is", 'is')] TMM18980901-V10-09-page35.txt: [("'M.", 'M.')] TMM18980901-V10-09-page7.txt: [("'Praise", 'Praise'), ("'Make", 'Make'), ("'I", 'I')] TMM18981001-V10-10-page11.txt: [("'Reformation", 'Reformation'), ("'of", 'of')] TMM18981001-V10-10-page13.txt: [("'at", 'at')] TMM18981001-V10-10-page15.txt: [("'villas", 'villas')] TMM18981001-V10-10-page21.txt: [("'Salvation", 'Salvation')] TMM18981001-V10-10-page37.txt: [("'il", 'il'), ("'A", 'A')] TMM18981001-V10-10-page6.txt: [("'eternal", 'eternal')] TMM18981101-V10-11-page15.txt: [("'divine", 'divine')] TMM18981101-V10-11-page17.txt: [("'Great", 'Great')] TMM18981101-V10-11-page18.txt: [("'foreigner", 'foreigner'), ("'Corn", 'Corn'), ("'Great", 'Great')] TMM18981101-V10-11-page22.txt: [("'native", 'native')] TMM18981101-V10-11-page26.txt: [("'new", 'new')] TMM18981101-V10-11-page32.txt: [('\'Creature."', 'Creature."')] TMM18981101-V10-11-page37.txt: [("'.", '.')] TMM18981101-V10-11-page5.txt: [("'authority", 'authority')] TMM18981101-V10-11-page9.txt: [("'a", 'a')] TMM18981201-V10-12-page10.txt: [("'no", 'no')] TMM18981201-V10-12-page11.txt: [("'not", 'not')] TMM18981201-V10-12-page31.txt: [("'he", 'he')] TMM18981201-V10-12-page4.txt: [("'the", 'the')] TMM18981201-V10-12-page41.txt: [("'tis", 'tis'), ("'twill", 'twill')] TMM18981201-V10-12-page43.txt: [("'liberty", 'liberty')] TMM18981201-V10-12-page7.txt: [("'Nile", 'Nile')] TMM18990101-V11-01-page14.txt: [("'peons", 'peons'), ("'The'gold", 'Thegold'), ("'trees", 'trees')] TMM18990101-V11-01-page20.txt: [("'our", 'our')] TMM18990101-V11-01-page34.txt: [("'southeastern", 'southeastern')] TMM18990101-V11-01-page39.txt: [("'If", 'If')] TMM18990101-V11-01-page41.txt: [("'why", 'why')] TMM18990101-V11-01-page47.txt: [("'Bible", 'Bible')] TMM18990201-V11-02-page11.txt: [("'Battle", 'Battle')] TMM18990201-V11-02-page31.txt: [("'Gather", 'Gather')] TMM18990201-V11-02-page49.txt: [("'whom", 'whom')] TMM18990201-V11-02-page51.txt: [("'or", 'or')] TMM18990201-V11-02-page55.txt: [("'i", 'i'), ("'I", 'I')] TMM18990301-V11-03-page14.txt: [("'the", 'the')] TMM18990301-V11-03-page16.txt: [("'best", 'best')] TMM18990301-V11-03-page26.txt: [("'He", 'He'), ("'is", 'is')] TMM18990301-V11-03-page28.txt: [("'to", 'to')] TMM18990301-V11-03-page37.txt: [("'acquainted", 'acquainted')] TMM18990401-V11-04-page1.txt: [("'VOL.", 'VOL.')] TMM18990401-V11-04-page3.txt: [("'hands", 'hands'), ("'Of", 'Of'), ("'Mercy", 'Mercy'), ("'drawn", 'drawn'), ("'culminate", 'culminate')] TMM18990401-V11-04-page7.txt: [("'with", 'with')] TMM18990501-V11-05-page35.txt: [("'cause", 'cause')] TMM18990501-V11-05-page47.txt: [("'i", 'i')] TMM18990601-V11-06-page10.txt: [("'very", 'very')] TMM18990601-V11-06-page11.txt: [("'one", 'one'), ("'each", 'each')] TMM18990601-V11-06-page12.txt: [("'produce", 'produce')] TMM18990601-V11-06-page24.txt: [("'the", 'the')] TMM18990601-V11-06-page29.txt: [("'northwest", 'northwest')] TMM18990601-V11-06-page31.txt: [("'another", 'another')] TMM18990701-V11-07-page11.txt: [("'Central", 'Central')] TMM18990701-V11-07-page17.txt: [("'These", 'These')] TMM18990701-V11-07-page2.txt: [("'The", 'The'), ("'God", 'God')] TMM18990701-V11-07-page25.txt: [("'Church", 'Church')] TMM18990701-V11-07-page33.txt: [("'Tis", 'Tis')] TMM18990701-V11-07-page37.txt: [("'baptism", 'baptism')] TMM18990701-V11-07-page39.txt: [("'Surely", 'Surely'), ("'Why", 'Why')] TMM18990701-V11-07-page4.txt: [("'China", 'China'), ("'will", 'will')] TMM18990701-V11-07-page40.txt: [("'Germany", 'Germany')] TMM18990701-V11-07-page42.txt: [("'We", 'We')] TMM18990701-V11-07-page46.txt: [("'row", 'row')] TMM18990701-V11-07-page47.txt: [("''ettntIV", 'ettntIV')] TMM18990801-V11-08-page11.txt: [("'R.", 'R.'), ("'.", '.'), ("'ft", 'ft'), ("'.", '.'), ("'.", '.'), ("'.", '.'), ("'C", 'C'), ("'RIM", 'RIM'), ("'iOnNA", 'iOnNA'), ("'Ct.", 'Ct.'), ("'cc", 'cc'), ("'..", '..'), ("'L.", 'L.'), ("'W", 'W'), ("'Mg", 'Mg'), ("'t", 't'), ("'.", '.'), ("'Co", 'Co'), ("'lgl", 'lgl'), ("'rt", 'rt'), ("'cCc", 'cCc')] TMM18990801-V11-08-page37.txt: [("'i'Selected", 'iSelected')] TMM18990901-V11-09-page1.txt: [("'ve", 've')] TMM18990901-V11-09-page12.txt: [("'that", 'that')] TMM18990901-V11-09-page25.txt: [("'Well", 'Well')] TMM18990901-V11-09-page42.txt: [("'The", 'The')] TMM18990901-V11-09-page43.txt: [("'Send", 'Send'), ("'comprehend", 'comprehend'), ("'Neglected", 'Neglected'), ("'Here", 'Here')] TMM18991001-V11-10-page11.txt: [("'and", 'and'), ("'more", 'more')] TMM18991001-V11-10-page22.txt: [("'to", 'to')] TMM18991001-V11-10-page27.txt: [("'mission", 'mission')] TMM18991001-V11-10-page30.txt: [("'licensed", 'licensed')] TMM18991001-V11-10-page4.txt: [("'to", 'to'), ("'selected", 'selected')] TMM18991001-V11-10-page41.txt: [("'giving", 'giving')] TMM18991001-V11-10-page42.txt: [("'twelve", 'twelve')] TMM18991001-V11-10-page43.txt: [("'How", 'How')] TMM18991001-V11-10-page45.txt: [("'Stich", 'Stich')] TMM18991101-V11-11-page23.txt: [("'...", '...'), ("'Z", 'Z')] TMM18991101-V11-11-page24.txt: [("'Many", 'Many'), ("'going", 'going')] TMM18991101-V11-11-page7.txt: [("'is", 'is')] TMM18991201-V11-12-page24.txt: [("'filled", 'filled')] TMM18991201-V11-12-page28.txt: [("'presidents", 'presidents')] TMM18991201-V11-12-page31.txt: [("'Whosoever", 'Whosoever')] TMM18991201-V11-12-page40.txt: [("'summer's", 'summers')] TMM18991201-V11-12-page45.txt: [("'benefited", 'benefited')] TMM19000101-V12-01-page1.txt: [("'GREECE", 'GREECE')] TMM19000101-V12-01-page10.txt: [("'qie", 'qie'), ("'o", 'o')] TMM19000101-V12-01-page29.txt: [("'blue", 'blue')] TMM19000101-V12-01-page31.txt: [("'climbing", 'climbing')] TMM19000101-V12-01-page32.txt: [("'twere", 'twere')] TMM19000101-V12-01-page34.txt: [("'We", 'We')] TMM19000101-V12-01-page35.txt: [("'I", 'I')] TMM19000101-V12-01-page46.txt: [("'make", 'make')] TMM19000101-V12-01-page48.txt: [("'Thou", 'Thou'), ("'No", 'No')] TMM19000101-V12-01-page52.txt: [("'BILLS", 'BILLS'), ("'TRANSFER", 'TRANSFER')] TMM19000201-V12-02-page1.txt: [("'THE", 'THE')] TMM19000201-V12-02-page30.txt: [("'Shall", 'Shall')] TMM19000201-V12-02-page39.txt: [("'the", 'the')] TMM19000201-V12-02-page5.txt: [('\'"and', '"and')] TMM19000201-V12-02-page51.txt: [("'Milk", 'Milk'), ("'COTTItEri", 'COTTItEri'), ("'BrOOkbtli", 'BrOOkbtli'), ("'clean", 'clean')] TMM19000301-V12-03-page17.txt: [("'Go", 'Go')] TMM19000301-V12-03-page2.txt: [("'God", 'God')] TMM19000301-V12-03-page31.txt: [("'Well", 'Well')] TMM19000301-V12-03-page34.txt: [("'not", 'not'), ("'powers", 'powers')] TMM19000301-V12-03-page36.txt: [("'Here", 'Here')] TMM19000301-V12-03-page47.txt: [("'upon", 'upon'), ("'Come", 'Come')] TMM19000301-V12-03-page48.txt: [("'well", 'well')] TMM19000301-V12-03-page5.txt: [("'the", 'the')] TMM19000301-V12-03-page9.txt: [("''j", 'j')] TMM19000401-V12-04-page15.txt: [("'I", 'I')] TMM19000401-V12-04-page16.txt: [("'centuries", 'centuries')] TMM19000401-V12-04-page29.txt: [("'suppose", 'suppose')] TMM19000401-V12-04-page33.txt: [("'Christ", 'Christ')] TMM19000401-V12-04-page51.txt: [("'s", 's'), ("'our", 'our'), ("'Come", 'Come')] TMM19000401-V12-04-page52.txt: [("'Tis", 'Tis')] TMM19000501-V12-05-page1.txt: [("'CIRCLE", 'CIRCLE')] TMM19000501-V12-05-page2.txt: [("'lasso", 'lasso')] TMM19000501-V12-05-page32.txt: [("'Lord", 'Lord')] TMM19000501-V12-05-page39.txt: [("'time", 'time')] TMM19000501-V12-05-page42.txt: [("'Years", 'Years')] TMM19000501-V12-05-page51.txt: [("'lath", 'lath'), ("'that", 'that'), ("'si", 'si'), ("'The", 'The'), ("'because", 'because')] TMM19000501-V12-05-page52.txt: [("'Tis", 'Tis')] TMM19000501-V12-05-page6.txt: [("'Tis", 'Tis')] TMM19000601-V12-06-page1.txt: [("'A", 'A'), ("'Nassau", 'Nassau')] TMM19000601-V12-06-page11.txt: [("'Come", 'Come')] TMM19000601-V12-06-page14.txt: [("'little", 'little')] TMM19000601-V12-06-page52.txt: [("'Tis", 'Tis')] TMM19000701-V12-07-page10.txt: [("'The", 'The')] TMM19000701-V12-07-page12.txt: [("'If", 'If'), ("'Ever", 'Ever')] TMM19000701-V12-07-page13.txt: [("'of", 'of')] TMM19000701-V12-07-page46.txt: [("'Well", 'Well')] TMM19000701-V12-07-page5.txt: [("'twas", 'twas')] TMM19000701-V12-07-page52.txt: [("'Tis", 'Tis')] TMM19000701-V12-07-page7.txt: [("'ping", 'ping')] TMM19000801-V12-08-page11.txt: [("'to", 'to')] TMM19000801-V12-08-page13.txt: [("'Mohammedan", 'Mohammedan')] TMM19000801-V12-08-page2.txt: [("'mw", 'mw')] TMM19000801-V12-08-page22.txt: [("'The", 'The')] TMM19000801-V12-08-page23.txt: [("'Sufficient", 'Sufficient')] TMM19000801-V12-08-page24.txt: [("'Have", 'Have')] TMM19000801-V12-08-page29.txt: [("'luminated", 'luminated')] TMM19000801-V12-08-page36.txt: [("'In", 'In')] TMM19000801-V12-08-page41.txt: [("'Casting", 'Casting'), ("'Cast", 'Cast'), ("'Hast", 'Hast'), ("'Commit", 'Commit'), ("'are", 'are')] TMM19000801-V12-08-page42.txt: [("'Underneath", 'Underneath')] TMM19000801-V12-08-page47.txt: [("'United", 'United')] TMM19000801-V12-08-page52.txt: [("'Tis", 'Tis'), ("'enA", 'enA')] TMM19000801-V12-08-page6.txt: [("'Behold", 'Behold')] TMM19000901-V12-09-page12.txt: [("'For", 'For'), ("'The", 'The'), ("'for", 'for'), ("'For", 'For'), ("'The", 'The'), ("'For", 'For'), ("'The", 'The'), ("'Though", 'Though')] TMM19000901-V12-09-page16.txt: [("'hall", 'hall')] TMM19000901-V12-09-page43.txt: [("'These", 'These')] TMM19000901-V12-09-page45.txt: [("'T", 'T')] TMM19000901-V12-09-page49.txt: [("'is", 'is')] TMM19000901-V12-09-page52.txt: [("'for", 'for'), ("'Tis", 'Tis')] TMM19000901-V12-09-page6.txt: [("'send", 'send'), ("'missionary", 'missionary')] TMM19000901-V12-09-page7.txt: [("'missionary", 'missionary')] TMM19001001-V12-10-page1.txt: [("'PERFECT", 'PERFECT'), ("'CHINA", 'CHINA'), ("'INDEPENDENCE", 'INDEPENDENCE'), ("'I.", 'I.'), ("'LETTERS", 'LETTERS'), ("'PUBLISHED", 'PUBLISHED')] TMM19001001-V12-10-page16.txt: [("'When", 'When')] TMM19001001-V12-10-page18.txt: [("'Pearly", 'Pearly')] TMM19001001-V12-10-page19.txt: [("'way", 'way'), ("'Old", 'Old'), ("'All", 'All'), ("'Reason", 'Reason')] TMM19001001-V12-10-page49.txt: [("'The", 'The'), ("'are", 'are'), ("'knows", 'knows')] TMM19001001-V12-10-page51.txt: [("'York.", 'York.')] TMM19001001-V12-10-page52.txt: [("'Tie", 'Tie'), ("'Latest", 'Latest')] TMM19001101-V12-11-page16.txt: [("'the", 'the')] TMM19001101-V12-11-page18.txt: [("'professional", 'professional')] TMM19001101-V12-11-page47.txt: [("'Why", 'Why')] TMM19001101-V12-11-page51.txt: [("'roof", 'roof'), ("'Brooklyn", 'Brooklyn'), ("'without", 'without'), ("'for", 'for'), ("'under", 'under'), ("'Venial..", 'Venial..')] TMM19001101-V12-11-page52.txt: [("'Seventh", 'Seventh'), ("'fine", 'fine'), ("'not", 'not')] TMM19001101-V12-11-page6.txt: [("'he", 'he'), ("'Cast", 'Cast'), ("'Cast", 'Cast')] TMM19001201-V12-12-page14.txt: [("'la", 'la')] TMM19001201-V12-12-page2.txt: [("'UNDERWOOD", 'UNDERWOOD')] TMM19001201-V12-12-page22.txt: [("'house", 'house')] TMM19001201-V12-12-page28.txt: [("'events.", 'events.')] TMM19001201-V12-12-page3.txt: [("'i", 'i')] TMM19001201-V12-12-page49.txt: [("'Volunteer", 'Volunteer')] TMM19001201-V12-12-page51.txt: [("'York.", 'York.')] TMM19001201-V12-12-page52.txt: [("'Tie", 'Tie')] TMM19001201-V12-12-page9.txt: [("'as", 'as')] TMM19020101-V14-01-page10.txt: [("'onechapter", 'onechapter')] TMM19020101-V14-01-page13.txt: [("'neath", 'neath'), ("'Tis", 'Tis'), ("'Tis", 'Tis')] TMM19020101-V14-01-page2.txt: [("'N", 'N')] TMM19020101-V14-01-page34.txt: [("'disease.", 'disease.')] TMM19020201-V14-02-page37.txt: [('\'"', '"')] TMM19020201-V14-02-page40.txt: [("'verse", 'verse')] TMM19020201-V14-02-page42.txt: [("'at", 'at')] TMM19020201-V14-02-page49.txt: [("'Jo", 'Jo')] TMM19020201-V14-02-page50.txt: [("'act", 'act'), ("'For", 'For')] TMM19020201-V14-02-page51.txt: [("'stigmatized", 'stigmatized')] TMM19020201-V14-02-page52.txt: [("'Scenery", 'Scenery')] TMM19020301-V14-03-page1.txt: [("'The", 'The'), ("'illustrated", 'illustrated'), ("'Japan", 'Japan'), ("'Among", 'Among'), ("'.", '.'), ("'MISSION", 'MISSION'), ("'.", '.'), ("'The", 'The'), ("'.", '.'), ("'Children", 'Children')] TMM19020301-V14-03-page10.txt: [("'will", 'will')] TMM19020301-V14-03-page2.txt: [("'ckroutes", 'ckroutes'), ("'.", '.'), ("'it", 'it'), ("'ST", 'ST'), ("'CIMAGO", 'CIMAGO'), ("'State", 'State')] TMM19020301-V14-03-page25.txt: [("'quarantine", 'quarantine')] TMM19020301-V14-03-page26.txt: [("'distant.", 'distant.')] TMM19020301-V14-03-page40.txt: [("'has", 'has')] TMM19020301-V14-03-page42.txt: [("'diet", 'diet')] TMM19020301-V14-03-page44.txt: [("'and", 'and')] TMM19020301-V14-03-page8.txt: [("'patient.", 'patient.')] TMM19020401-V14-04-page12.txt: [("'effigy", 'effigy')] TMM19020401-V14-04-page15.txt: [("'Tis", 'Tis')] TMM19020401-V14-04-page19.txt: [("'the", 'the')] TMM19020401-V14-04-page39.txt: [("'Therefore", 'Therefore')] TMM19020401-V14-04-page46.txt: [("'down", 'down')] TMM19020401-V14-04-page51.txt: [("'Tremont", 'Tremont')] TMM19020401-V14-04-page52.txt: [("'OW", 'OW')] TMM19020501-V14-05-page14.txt: [("'had", 'had')] TMM19020501-V14-05-page2.txt: [("'CABINET", 'CABINET')] TMM19020501-V14-05-page24.txt: [("'from", 'from')] TMM19020501-V14-05-page31.txt: [("'But", 'But')] TMM19020501-V14-05-page32.txt: [("'only", 'only')] TMM19020501-V14-05-page37.txt: [("'come", 'come')] TMM19020501-V14-05-page51.txt: [("'.", '.'), ("'....", '....'), ("'details", 'details')] TMM19020501-V14-05-page6.txt: [("'love.", 'love.')]
In [36]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/TMM/correction4 Average verified rate: 0.9845119517865869 Average of error rates: 0.022646753246753245 Total token count: 861503
In [37]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[37]:
[('e', 485), ('w', 476), ("'", 466), ('t', 344), ('m', 343), ('r', 313), ('d', 303), ('n', 302), ('f', 271), ('g', 254), ('th', 110), ('x', 76), ('co', 70), ('k', 68), ('pa', 64), ('u', 64), ('z', 63), ('mis', 42), ('io', 42), ('oc', 40), ('oo', 33), ('cc', 31), ('sionary', 29), ('re', 25), ('al', 23), ('q', 22), ('mt', 20), ('hausaland', 19), ('id', 19), ('ft', 19), ('stauffer', 19), ('ary', 19), ('basle', 18), ('zo', 18), ('mo', 18), ('couva', 17), ('kalaka', 17), ('hasegawa', 17), ('sul', 17), ('okohira', 16), ('ro', 16), ('pp', 15), ('helsingfors', 15), ('sabbathschool', 15), ("hours'", 15), ('te', 15), ('schwantes', 15), ('raiatea', 15), ('wm', 15), ('ioo', 14)]
Correction 5 -- Rejoin Split Words¶
In [39]:
# %load shared_elements/rejoin_split_words.py
prev = cycle
cycle = "correction5"
directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
os.makedirs(directories['cycle'])
corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))
for filename in corpus:
content = utilities.readfile(directories['prev'], filename)
text = re.sub(r"[0-9,!?$:;&]", " ", content)
tokens = utilities.tokenize_text(text)
errors = reports.identify_errors(tokens, spelling_dictionary)
replacements = clean.check_if_stem(errors, spelling_dictionary, tokens, get_prior=False)
if len(replacements) > 0:
print('{}: {}'.format(filename, replacements))
for replacement in replacements:
content = clean.replace_split_words(replacement, content)
else:
pass
with open(join(directories['cycle'], filename), mode="w") as o:
o.write(content)
o.close()
TMM18980101-V10-01-page1.txt: [('Mis', 'SIONARY')] TMM18980101-V10-01-page31.txt: [('ro', 'of')] TMM18980101-V10-01-page4.txt: [('fil', 'A')] TMM18980201-V10-02-page37.txt: [('MIS', 'SION'), ('QUAR', 'TER')] TMM18980201-V10-02-page38.txt: [('zo', 'o')] TMM18980201-V10-02-page39.txt: [('SECRE', 'TARIES')] TMM18980201-V10-02-page9.txt: [('Ning', 'po')] TMM18980301-V10-03-page24.txt: [('Mis', 'SIONARY')] TMM18980301-V10-03-page25.txt: [('es', 'd'), ('mi', 'o'), ('por', 'no'), ('Aqui', 'no')] TMM18980301-V10-03-page39.txt: [('Mis', 'SION')] TMM18980301-V10-03-page6.txt: [('Fi', 'le')] TMM18980301-V10-03-page8.txt: [('Mc', 'Carthy')] TMM18980401-V10-04-page26.txt: [('HISTOR', 'ICAL')] TMM18980401-V10-04-page3.txt: [('G.', '')] TMM18980401-V10-04-page36.txt: [('pais', 'a')] TMM18980401-V10-04-page7.txt: [("KING'", 'S')] TMM18980501-V10-05-page33.txt: [("God'", 's')] TMM18980501-V10-05-page35.txt: [('MIS', 'SION'), ('QUAR', 'TER')] TMM18980501-V10-05-page39.txt: [('re', 'leased')] TMM18980601-V10-06-page16.txt: [('sr', 'A')] TMM18980801-V10-08-page35.txt: [('MIS', 'SION'), ('QUAR', 'TER')] TMM18980801-V10-08-page38.txt: [('MIS', 'SIONARY')] TMM18980901-V10-09-page17.txt: [('wh', 'en')] TMM18981001-V10-10-page28.txt: [('Previo', 'us')] TMM18981101-V10-11-page17.txt: [('MIS', 'SIONARY')] TMM18981101-V10-11-page20.txt: [('Tien', 'Tsin')] TMM18981101-V10-11-page25.txt: [('MAGA', 'ZINE')] TMM18981101-V10-11-page36.txt: [('SOCI', 'ETY')] TMM18981101-V10-11-page37.txt: [('MAGA', 'ZINE')] TMM18981101-V10-11-page38.txt: [('MAGA', 'ZINE')] TMM18981201-V10-12-page41.txt: [('wa', 'y'), ('MC', 'CARTHY')] TMM18990101-V11-01-page14.txt: [('es', 'Pecially')] TMM18990101-V11-01-page29.txt: [('MIS', 'SIONARY')] TMM18990101-V11-01-page47.txt: [('PHILADEL', 'PHIA')] TMM18990101-V11-01-page48.txt: [('repre', 'sentative')] TMM18990201-V11-02-page14.txt: [("Angel'", 's')] TMM18990201-V11-02-page52.txt: [('MIS', 'SION'), ('QUAR', 'TER')] TMM18990201-V11-02-page54.txt: [('PHILADEL', 'PHIA')] TMM18990301-V11-03-page11.txt: [('tri', 'weekly')] TMM18990301-V11-03-page34.txt: [('G.', '')] TMM18990301-V11-03-page38.txt: [('PHILADEL', 'PHIA')] TMM18990401-V11-04-page36.txt: [('th', 'or')] TMM18990401-V11-04-page38.txt: [('PHILADEL', 'PHIA'), ('Indo', 'China')] TMM18990401-V11-04-page39.txt: [('repre', 'sentative')] TMM18990501-V11-05-page42.txt: [('MIS', 'SION'), ('QUAR', 'TER')] TMM18990501-V11-05-page46.txt: [('PHILADEL', 'PHIA')] TMM18990601-V11-06-page46.txt: [('PHILADEL', 'PHIA')] TMM18990701-V11-07-page26.txt: [('civiliz', 'ation')] TMM18990701-V11-07-page27.txt: [('so-', 'called')] TMM18990701-V11-07-page41.txt: [('longsuffer', 'ing')] TMM18990701-V11-07-page46.txt: [('MIS', 'SIONARY')] TMM18990801-V11-08-page11.txt: [('PC', 't'), ('al', 'i'), ('CA', 'W'), ('RI', 'a'), ('TE', 'R'), ('ato', 'N'), ('re', 'C'), ('ma', 'I'), ('te', 'a'), ('tAl', 'a'), ('JU', 'N'), ('Ele', 'a'), ('EV', 'I'), ('CI', 'T')] TMM18990801-V11-08-page22.txt: [('IL', 'A')] TMM18990801-V11-08-page34.txt: [('Philadel', 'phia')] TMM18990801-V11-08-page45.txt: [('MIS', 'SION')] TMM18990801-V11-08-page46.txt: [('MIS', 'SIONARY')] TMM18990901-V11-09-page34.txt: [('co', 'laborer')] TMM18990901-V11-09-page35.txt: [('mis', 'sionary')] TMM18990901-V11-09-page46.txt: [('MIS', 'SIONARY')] TMM18991001-V11-10-page17.txt: [('Mc', 'Carthy')] TMM18991001-V11-10-page46.txt: [('MIS', 'SIONARY')] TMM18991101-V11-11-page37.txt: [('MAGA', 'ZINE')] TMM18991101-V11-11-page42.txt: [('MIS', 'SION')] TMM18991101-V11-11-page44.txt: [('Indo', 'China')] TMM18991101-V11-11-page46.txt: [('MIS', 'SIONARY'), ('Superin', 'tendent')] TMM18991201-V11-12-page21.txt: [('MC', 'CARTHY')] TMM18991201-V11-12-page31.txt: [('th', 'at')] TMM18991201-V11-12-page38.txt: [('RE', 'V')] TMM18991201-V11-12-page40.txt: [('Nebuchadn', "ezzar's")] TMM18991201-V11-12-page46.txt: [('MIS', 'SIONARY')] TMM19000101-V12-01-page10.txt: [('re', 'a')] TMM19000101-V12-01-page37.txt: [('MAGA', 'ZINE')] TMM19000101-V12-01-page44.txt: [('MIS', 'SIONARY')] TMM19000101-V12-01-page47.txt: [('MAGA', 'ZINE')] TMM19000101-V12-01-page50.txt: [('MIS', 'SIONARY'), ('MAGA', 'ZINE')] TMM19000101-V12-01-page51.txt: [('reci', 'pes')] TMM19000101-V12-01-page6.txt: [('un', 'INTELLIGENT')] TMM19000201-V12-02-page36.txt: [('corre', 'sponding')] TMM19000201-V12-02-page40.txt: [('Indo', 'China'), ('EM', 'It')] TMM19000201-V12-02-page46.txt: [('MIS', 'SION')] TMM19000201-V12-02-page50.txt: [('MIS', 'SIONARY')] TMM19000201-V12-02-page51.txt: [('TRE', 'S')] TMM19000201-V12-02-page52.txt: [('ig', 'n')] TMM19000301-V12-03-page39.txt: [('pre', 'arrangement')] TMM19000301-V12-03-page46.txt: [('ti', 'to')] TMM19000401-V12-04-page50.txt: [('MIS', 'SIONARY')] TMM19000401-V12-04-page51.txt: [('WA', 'RDS')] TMM19000501-V12-05-page18.txt: [('CA', 'VINESS')] TMM19000501-V12-05-page22.txt: [('re', 'no')] TMM19000501-V12-05-page45.txt: [('MIS', 'SION')] TMM19000501-V12-05-page50.txt: [('MIS', 'SIONARY')] TMM19000501-V12-05-page52.txt: [('Li', 'Q')] TMM19000601-V12-06-page21.txt: [('Ju', 'n')] TMM19000601-V12-06-page45.txt: [('RE', 'VIEW')] TMM19000601-V12-06-page49.txt: [('infor', 'mation')] TMM19000601-V12-06-page50.txt: [('MIS', 'SIONARY')] TMM19000701-V12-07-page40.txt: [('exac', 'test')] TMM19000701-V12-07-page50.txt: [('MIS', 'SIONARY'), ('regula', 'rly')] TMM19000701-V12-07-page52.txt: [('SIMPLICIT', 'Y')] TMM19000801-V12-08-page12.txt: [('wh', 'o')] TMM19000801-V12-08-page44.txt: [('MIS', 'SION')] TMM19000801-V12-08-page45.txt: [('MA', 'TABELE')] TMM19000801-V12-08-page47.txt: [('Tien', 'Tsin')] TMM19000801-V12-08-page50.txt: [('MIS', 'SIONARY')] TMM19000801-V12-08-page51.txt: [('VESTIBU', 'LED')] TMM19000801-V12-08-page52.txt: [('SIMPLIC', 'ITY')] TMM19000901-V12-09-page32.txt: [('MC', 'CARTHY')] TMM19000901-V12-09-page5.txt: [('MC', 'CARTHY')] TMM19000901-V12-09-page50.txt: [('MIS', 'SIONARY')] TMM19001001-V12-10-page20.txt: [('MC', 'CARTHY')] TMM19001001-V12-10-page38.txt: [('studen', 'ts')] TMM19001001-V12-10-page44.txt: [('re', 'ct')] TMM19001001-V12-10-page50.txt: [('MIS', 'SIONARY')] TMM19001101-V12-11-page28.txt: [('MC', 'CARTHY')] TMM19001101-V12-11-page44.txt: [('MIS', 'SION')] TMM19001101-V12-11-page45.txt: [('RARATONG', 'A')] TMM19001101-V12-11-page47.txt: [('fel', 'lows')] TMM19001101-V12-11-page50.txt: [('MIS', 'SIONARY')] TMM19001101-V12-11-page51.txt: [('BEW', 'ARE'), ('re', 'Price')] TMM19001201-V12-12-page11.txt: [('magnif', 'icent')] TMM19001201-V12-12-page18.txt: [('MC', 'CARTHY')] TMM19001201-V12-12-page2.txt: [('co', 'mpany')] TMM19001201-V12-12-page31.txt: [('wa', 's')] TMM19001201-V12-12-page35.txt: [('re', 'reading')] TMM19001201-V12-12-page47.txt: [('Ro', 'man')] TMM19001201-V12-12-page50.txt: [('MIS', 'SIONARY')] TMM19020101-V14-01-page43.txt: [('Guadalaj', 'ara')] TMM19020201-V14-02-page17.txt: [('unf', 'allen')] TMM19020201-V14-02-page33.txt: [('th', 'or')] TMM19020201-V14-02-page48.txt: [('ASSO', 'CIATION')] TMM19020201-V14-02-page49.txt: [('LOUISI', 'ANA'), ('NE', 'BRASKA')] TMM19020201-V14-02-page8.txt: [('Tien', 'Tsin')] TMM19020301-V14-03-page1.txt: [('EDITORI', 'AL')] TMM19020301-V14-03-page16.txt: [('MC', 'CARTHY')] TMM19020301-V14-03-page26.txt: [('th', 'o')] TMM19020401-V14-04-page16.txt: [('pA', 'L')] TMM19020401-V14-04-page47.txt: [('Sul', 'a')] TMM19020401-V14-04-page52.txt: [('ma', 'M')] TMM19020501-V14-05-page1.txt: [('TH', 'E')] TMM19020501-V14-05-page11.txt: [('suc', 'cess')] TMM19020501-V14-05-page2.txt: [('GA', 'g')] TMM19020501-V14-05-page48.txt: [('oc', 'H'), ('BENEV', 'OLENT')] TMM19020501-V14-05-page7.txt: [('MAGA', 'ZINE')]
In [42]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/TMM/correction5 Average verified rate: 0.984760022894214 Average of error rates: 0.022267532467532464 Total token count: 861353
In [43]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[43]:
[('e', 484), ('w', 476), ("'", 466), ('m', 342), ('t', 342), ('r', 312), ('d', 302), ('n', 300), ('f', 271), ('g', 254), ('th', 107), ('x', 76), ('co', 69), ('k', 68), ('u', 64), ('pa', 63), ('z', 63), ('io', 42), ('oc', 40), ('oo', 33), ('cc', 31), ('al', 21), ('q', 21), ('mt', 20), ('re', 20), ('hausaland', 19), ('id', 19), ('ary', 19), ('ft', 19), ('stauffer', 19), ('zo', 18), ('basle', 18), ('mo', 18), ('couva', 17), ('kalaka', 17), ('hasegawa', 17), ('sul', 17), ('okohira', 16), ('helsingfors', 15), ('pp', 15), ('sabbathschool', 15), ("hours'", 15), ('schwantes', 15), ('raiatea', 15), ('wm', 15), ('ro', 15), ('ioo', 14), ('seventhday', 14), ('ic', 14), ('te', 14)]
Correction 6 -- Rejoin Split Words II¶
In [45]:
# %load shared_elements/rejoin_split_words.py
prev = cycle
cycle = "correction6"
directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
os.makedirs(directories['cycle'])
corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))
for filename in corpus:
content = utilities.readfile(directories['prev'], filename)
text = re.sub(r"[0-9,!?$:;&]", " ", content)
tokens = utilities.tokenize_text(text)
errors = reports.identify_errors(tokens, spelling_dictionary)
replacements = clean.check_if_stem(errors, spelling_dictionary, tokens, get_prior=True)
if len(replacements) > 0:
print('{}: {}'.format(filename, replacements))
for replacement in replacements:
content = clean.replace_split_words(replacement, content)
else:
pass
with open(join(directories['cycle'], filename), mode="w") as o:
o.write(content)
o.close()
TMM18980201-V10-02-page37.txt: [('V', 'ermont')] TMM18980301-V10-03-page25.txt: [('No', 'se')] TMM18980301-V10-03-page39.txt: [('MisSION', 'ARY')] TMM18980501-V10-05-page33.txt: [('whole', 'heartedness')] TMM18980601-V10-06-page16.txt: [('r', 'OW'), ('the', 'Re')] TMM18980901-V10-09-page5.txt: [('sub', 'terranean')] TMM18981001-V10-10-page28.txt: [('w', 'ork')] TMM18981201-V10-12-page42.txt: [('u', 'tA')] TMM18981201-V10-12-page45.txt: [('Character', 'istic')] TMM18990101-V11-01-page14.txt: [('es', 'Pecially')] TMM18990101-V11-01-page47.txt: [('MISSION', 'ARY')] TMM18990101-V11-01-page48.txt: [('repre', 'sentative')] TMM18990201-V11-02-page45.txt: [('o', 'ffer')] TMM18990201-V11-02-page54.txt: [('MISSION', 'ARY')] TMM18990301-V11-03-page34.txt: [('Num', 'ber')] TMM18990301-V11-03-page38.txt: [('MISSION', 'ARY')] TMM18990301-V11-03-page9.txt: [('r', 'oth')] TMM18990401-V11-04-page38.txt: [('MISSION', 'ARY')] TMM18990401-V11-04-page39.txt: [('repre', 'sentative')] TMM18990501-V11-05-page22.txt: [('req', 'uirement')] TMM18990501-V11-05-page26.txt: [('in', 'ti')] TMM18990501-V11-05-page46.txt: [('MISSION', 'ARY')] TMM18990601-V11-06-page46.txt: [('MISSION', 'ARY')] TMM18990701-V11-07-page17.txt: [('a', 'ny')] TMM18990701-V11-07-page5.txt: [('gov', "ernor's")] TMM18990801-V11-08-page11.txt: [('a', 'tt'), ('t', 'il'), ('s', 'gt'), ('at', 'co'), ('a', 'te'), ('a', 'RIZ')] TMM18990801-V11-08-page20.txt: [('p', 'ork')] TMM18990901-V11-09-page3.txt: [('MISSION', 'ARY')] TMM18991001-V11-10-page37.txt: [('a', 're')] TMM18991001-V11-10-page46.txt: [('an', 'swers')] TMM18991201-V11-12-page29.txt: [('ha', 've')] TMM18991201-V11-12-page40.txt: [('Nebuchadn', "ezzar's")] TMM19000101-V12-01-page43.txt: [('a', 'nd')] TMM19000101-V12-01-page47.txt: [('MISSION', 'ARY')] TMM19000101-V12-01-page52.txt: [('DEVELOP', 'MENT')] TMM19000201-V12-02-page29.txt: [('a', 'nd')] TMM19000201-V12-02-page36.txt: [('corre', 'sponding')] TMM19000201-V12-02-page49.txt: [('Miss', 'IONARY')] TMM19000201-V12-02-page51.txt: [('Mission', 'arY')] TMM19000301-V12-03-page42.txt: [('my', 'thology')] TMM19000301-V12-03-page45.txt: [('hard', 'ly')] TMM19000301-V12-03-page8.txt: [('the', 'Ta')] TMM19000401-V12-04-page25.txt: [('car', 'ried')] TMM19000501-V12-05-page32.txt: [('MISSION', 'ARY')] TMM19000501-V12-05-page33.txt: [('wonder', 'ful')] TMM19000501-V12-05-page5.txt: [('second', 'ary')] TMM19000601-V12-06-page10.txt: [('a', 'li')] TMM19000601-V12-06-page11.txt: [('be', 'ng')] TMM19000601-V12-06-page52.txt: [('the', 'Remin')] TMM19000801-V12-08-page34.txt: [('to', 'Shiba')] TMM19000801-V12-08-page5.txt: [('cent', 'uries')] TMM19000901-V12-09-page51.txt: [('con', 'nection')] TMM19001001-V12-10-page31.txt: [('a', 'nd')] TMM19001001-V12-10-page44.txt: [('re', 'ct'), ('a', 're')] TMM19001001-V12-10-page52.txt: [('A', 'IL')] TMM19001001-V12-10-page6.txt: [('Am', 'alekites')] TMM19001001-V12-10-page8.txt: [('MISSION', 'ARY')] TMM19001101-V12-11-page51.txt: [('P', 'hiladelphia')] TMM19001201-V12-12-page2.txt: [('co', 'mpany')] TMM19001201-V12-12-page23.txt: [('k', 'eeping')] TMM19020201-V14-02-page21.txt: [('MISSION', 'ARY')] TMM19020201-V14-02-page31.txt: [('Aguas', 'Calientes')] TMM19020201-V14-02-page48.txt: [('Bell', 'oc')] TMM19020201-V14-02-page52.txt: [('CEN', 'TRAL'), ('R', 'IP')] TMM19020301-V14-03-page2.txt: [('B', 'RA')] TMM19020301-V14-03-page3.txt: [('CEN', 'TRAL')] TMM19020301-V14-03-page34.txt: [('con', 'verts')] TMM19020401-V14-04-page3.txt: [('CEN', 'TRAL')] TMM19020501-V14-05-page17.txt: [('the', 'mis')] TMM19020501-V14-05-page2.txt: [('a', 'GA')] TMM19020501-V14-05-page3.txt: [('CEN', 'TRAL'), ('E', 'xcursion')] TMM19020501-V14-05-page8.txt: [('pro', 'tection')]
In [48]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/TMM/correction6 Average verified rate: 0.9848273395829028 Average of error rates: 0.02219012987012987 Total token count: 861286
In [49]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[49]:
[('e', 484), ('w', 475), ("'", 466), ('m', 342), ('t', 342), ('r', 310), ('d', 302), ('n', 300), ('f', 271), ('g', 254), ('th', 107), ('x', 76), ('co', 68), ('k', 67), ('u', 64), ('pa', 63), ('z', 63), ('io', 42), ('oc', 40), ('oo', 33), ('cc', 31), ('al', 21), ('q', 21), ('mt', 20), ('hausaland', 19), ('id', 19), ('ft', 19), ('stauffer', 19), ('zo', 18), ('basle', 18), ('mo', 18), ('couva', 17), ('kalaka', 17), ('hasegawa', 17), ('sul', 17), ('re', 17), ('okohira', 16), ('helsingfors', 15), ('pp', 15), ('sabbathschool', 15), ("hours'", 15), ('schwantes', 15), ('raiatea', 15), ('wm', 15), ('ro', 15), ('ioo', 14), ('seventhday', 14), ('ic', 14), ("''", 13), ('te', 13)]
Review Remaining Errors¶
In [50]:
reports.docs_with_high_error_rate(summary)
Out[50]:
[('TMM18980701-V10-07-page42.txt', 1.0), ('TMM18990201-V11-02-page10.txt', 0.9), ('TMM19000301-V12-03-page9.txt', 0.614), ('TMM18980101-V10-01-page4.txt', 0.605), ('TMM18991101-V11-11-page23.txt', 0.534), ('TMM18980301-V10-03-page25.txt', 0.517), ('TMM18990801-V11-08-page11.txt', 0.512), ('TMM18980401-V10-04-page4.txt', 0.5), ('TMM18990301-V11-03-page17.txt', 0.5), ('TMM18990701-V11-07-page10.txt', 0.5), ('TMM18990401-V11-04-page4.txt', 0.5), ('TMM18980301-V10-03-page6.txt', 0.449), ('TMM18980601-V10-06-page20.txt', 0.448), ('TMM19020301-V14-03-page2.txt', 0.363), ('TMM18980501-V10-05-page28.txt', 0.341), ('TMM18980301-V10-03-page18.txt', 0.333), ('TMM18980301-V10-03-page10.txt', 0.333), ('TMM18990501-V11-05-page48.txt', 0.333), ('TMM18990801-V11-08-page48.txt', 0.321), ('TMM19001001-V12-10-page1.txt', 0.317), ('TMM19000601-V12-06-page48.txt', 0.302), ('TMM19000101-V12-01-page10.txt', 0.291), ('TMM18990101-V11-01-page48.txt', 0.263), ('TMM18981201-V10-12-page20.txt', 0.25), ('TMM19000501-V12-05-page4.txt', 0.25), ('TMM18990601-V11-06-page48.txt', 0.25), ('TMM18980701-V10-07-page38.txt', 0.235), ('TMM18980201-V10-02-page13.txt', 0.219), ('TMM19020501-V14-05-page51.txt', 0.213)]
In [52]:
# %load shared_elements/high_error_rates.py
doc_keys = [x[0] for x in reports.docs_with_high_error_rate(summary) if x[1] > 0.3]
utilities.open_original_docs(doc_keys, directories['cycle'])
Opened files: TMM18980701-V10-07-page42.txt TMM18990201-V11-02-page10.txt TMM19000301-V12-03-page9.txt TMM18980101-V10-01-page4.txt TMM18991101-V11-11-page23.txt TMM18980301-V10-03-page25.txt TMM18990801-V11-08-page11.txt TMM18980401-V10-04-page4.txt TMM18990301-V11-03-page17.txt TMM18990701-V11-07-page10.txt TMM18990401-V11-04-page4.txt TMM18980301-V10-03-page6.txt TMM18980601-V10-06-page20.txt TMM19020301-V14-03-page2.txt TMM18980501-V10-05-page28.txt TMM18980301-V10-03-page18.txt TMM18980301-V10-03-page10.txt TMM18990501-V11-05-page48.txt TMM18990801-V11-08-page48.txt TMM19001001-V12-10-page1.txt TMM19000601-V12-06-page48.txt
Most of the high error documents match the usual pattern of maps, images, and charts. One interesting exception is "TMM18980301-V10-03-page25.txt", which is in Spanish. I examined the original OCR and there were no accent marks that were lost during normalizing.
In [55]:
reports.long_errors(errors_summary, min_length=15)
Out[55]:
(['austria-hungaria', 'scripture-sabbath', 'gospel-commission', 'elevatedrailroad', 'newly-established', 'darjeeling-above', 'spanish-speaking', 'soul-and-body-destroying', 'into-insignificance', 'heaven-descended', 'hastily-organized', 'greatgrandparents', "'globetrottings'", 'hard-heartedness', 'stivkimikarkaaagiaiwatkaaiiiikiiiikit', 'scene-guadalajara', 'self-commendation', 'apoitleshipbeitring', 'interestinebible', 'fourteenyear-old', 'mamouret-ul-aziz', 'nezdterrerwiethe', 'cigarette-papers', 'pylrlitigeltrlile', 'self-denyingfollowers', 'joinherinherlabors', 'artificially-made', "controversy'''among", 'out-stations--one', 'long-experienced', 'intelligent-looking', 'milkailkiiticimallikillitcattikilit', 'spanish-american', 'otherispanish-speaking', 'charity-begins-at-home', 'self-complacency', "looks'upon-their", 'inexactconformity', 'self-aggrandizement', 'daughters-in-law', 'fourthsabbathexercise', 'sonderburg-glucksburg', 'christianfarmers', 'waterloo-jamaica', 'thickly-timbered', 'fire-worshippers', 'frontispiece-mamma', 'blood-corpuscles', 'literally-fulfiled', 'erichermerchantsintheusualway', 'kaailikarkalikaii', 'nrinfimparirlittlawawlit', 'appropropriately', 'innocent-looking', 'vapolreidzeerrewniteh', 'inspector-general', 'chinese-japanese', 'accountabilities', 'tamtatikivnityleysa', 'pricedinitrzements', 'pleasure-seekers', "sabbath-keeper's", 'lifrimmiiimenspirmiivinfillir', 'governorgenerals', 'ersacizaznovovar', 'ereationaniagara', 'trial--freeesendme', 'printing-presses', 'sasiiiiisnamiximinbegpeemnize', 'avcitosivivocktickpeptv', 'calvary-redeemed', 'rifirmtairiiiitliww', 'self-forgetfulness', 'civilizationexists', 'nieswaynorkadvocateofworld', 'more-than-one-halffinger-long', 'ronoliilgichinese', 'buildingresembling', 'ofwhose-pronunciation', 'tlimmoutrlosillummultrm', 'christ-followers', 'orikakipkokyikartikawaavaikaaiiikaiiikaitio', 'lengthandquality', 'fircaraitisttiattkiisikaikaikatwatekattit', 'sp-anish-speaking', 'doppelschraubenpostdampfer', 'panama-hat-is-aproduetion-of', 'christianity-its', 'narrow-mindedness', 'ilhaillimillkillitilliiraliirailhallikakiiilkilitilikiiilki', 'intienregstdxperience', 'semi-independent', 'sugar-plantation', 'self-propagation', 'buluwayo-zambesia', 'swedish-speaking', 'simple-mindedness', 'anti-progressive', 'rotilezdtftervtee', 'christianizingthe', 'frontispiece-harbor', 'attentiontshould', 'infaluableutenaill', 'broad-shouldered', 'pcmammariscloist', 'italian-speaking', 'health-restoring', 'long-established', 'sixteenprovinces', 'learningsomething', 'mievivimmeirinfirfa', 'recently-developed', 'non-commissioned', 'church-fellowship', 'english-japanese', "stauffer'szletter", 'germanicthoroughlyy', 'doppelsehraubenpostdampfer', 'church-membership', 'anidrgceontfaolutnsd', 'japanese-english', 'commandment-keeping', 'well-proportioned', 'nativity-interior', 'french-switzerland', 'powakikrilifwvfairarlit', 'american-spanish', 'ttttttttttttttttts', 'frontispiece-thatched', 'tfirinfargiiralt', 'subscriptionsshould', 'poorly-furnished', 'self-established', 'alexandria-troas', 'irillillkillrallikillbilibilirrillralillkillullitilllikillp', 'coffee-producing', 'artistically-built', 'boarding-schools', 'fanning-machines', 'amphitheatershaped', 'wthetreyovuaneotrsierpyiyot', 'kindergarten-school', 'fellow-passenger', 'gorgeously-arrayed', 'generalconference', 'hopelessicondition', 'sabbath-breaking', 'ratilikalattidir', 'accommodation-houses', 'fitthstliamascit', 'earthquake-visited', 'ositviivtablebbk', 'interspersedhere', 'brilliantly-lighted', 'fellow-countrymen', 'poverty-stricken', 'ethical-political', 'light-complexioned', 'commander-in-chief', 'chimney-blackened', 'thrashing-machine', 'tkilikarstatatamtitaiwookiiiikakotarkit', 'governor-general', 'anti-footbinding', 'alexandria-troas--had', 'firfinifwillirlrilliirilirlitilitalrerwlik'], 15)
In [ ]: