TMM-OCR-Evaluation-and-Correction

In [1]:
%load_ext autoreload
In [2]:
%autoreload 2
In [3]:
from text2topics import reports
from text2topics import utilities
from text2topics import clean
import re
import os
from os import listdir
from os.path import isfile, join
import collections
In [4]:
%matplotlib inline
In [5]:
wordlist_dir = "/Users/jeriwieringa/Dissertation/drafts/data/word-lists"
wordlists = ["2016-12-07-SDA-last-names.txt", 
             "2016-12-07-SDA-place-names.txt", 
             "2016-12-08-SDA-Vocabulary.txt", 
             "2017-01-03-place-names.txt", 
             "2017-02-14-Base-Word-List-SCOWL&KJV.txt",
             "2017-02-14-Roman-Numerals.txt",
             "2017-03-01-Additional-Approved-Words.txt"
            ]
In [6]:
spelling_dictionary = utilities.create_spelling_dictionary(wordlist_dir, wordlists)
In [7]:
title = "TMM"
In [8]:
base_dir = "/Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/{}/".format(title)

Baseline

In [9]:
cycle = 'baseline'
In [10]:
stats = reports.overview_report(join(base_dir, cycle), spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/TMM/baseline

Average verified rate: 0.9620184818421186

Average of error rates: 0.048324675324675326

Total token count: 870476

In [11]:
errors_summary = reports.get_errors_summary( stats )
reports.top_errors( errors_summary, 30 )
Out[11]:
[('-', 1111),
 ("'", 540),
 ('¥', 474),
 ('w', 467),
 ('e', 451),
 ('ñ', 412),
 (')', 345),
 ('m', 331),
 ('n', 290),
 ('r', 290),
 ('t', 285),
 ('d', 273),
 ('con-', 259),
 ('f', 239),
 ('g', 237),
 ('re-', 221),
 ('tion', 198),
 ('mis-', 160),
 ('in-', 147),
 ('*', 145),
 ('(', 128),
 ('com-', 117),
 ('ñthe', 107),
 ('th', 106),
 ('be-', 105),
 ('¡', 91),
 ('de-', 87),
 ('sionary', 87),
 ('mission-', 82),
 ('ment', 78),
 ('ex-', 76),
 ('ary', 70),
 ('tions', 69),
 ('co', 68),
 ('x', 67),
 ('pa', 63),
 ('k', 62),
 ('en-', 61),
 ('u', 61),
 ('[illustrated]', 60),
 ('+', 60),
 ('per-', 59),
 ('pro-', 58),
 ('/', 58),
 ('z', 56),
 ('dis-', 53),
 ('ple', 51),
 ('peo-', 49),
 ('(the', 49),
 ('pre-', 48),
 ('ers', 47),
 ('an-', 46),
 ('un-', 46),
 ('ad-', 43),
 ('ence', 42),
 ('ñwe', 41),
 ('(illustrated)', 41),
 ('io', 41),
 ('oc', 40),
 ('_', 40),
 ('ber', 39),
 ('inter-', 39),
 ('ã', 39),
 ('ac-', 38),
 ('for-', 38),
 ('to-', 37),
 ('meet-', 37),
 ('im-', 36),
 ('thou-', 35),
 ('can-', 34),
 (']', 33),
 ('ña', 32),
 ('oo', 32),
 ('mes-', 31),
 ('al-', 31)]

Check Special Character Use

In [12]:
reports.tokens_with_special_characters(errors_summary)
Out[12]:
[('¥', 474),
 ('ñ', 412),
 (')', 345),
 ('*', 145),
 ('(', 128),
 ('ñthe', 107),
 ('¡', 91),
 ('+', 60),
 ('[illustrated]', 60),
 ('/', 58),
 ('(the', 49),
 ('ñwe', 41),
 ('(illustrated)', 41),
 ('_', 40),
 ('ã', 39),
 (']', 33),
 ('ña', 32),
 ('ñall', 29),
 ('`', 27),
 ('(b)', 26),
 ('ñin', 26),
 ('(c)', 26),
 ('(a)', 26),
 ('(d)', 23),
 ('=', 23),
 ('ñit', 22),
 ('ñsubscriptions', 22),
 ('%', 22),
 ('ñelder', 21),
 ('departmentñ', 21),
 ('(a', 21),
 ('(march)', 20),
 ('ô', 19),
 ('¥¥', 19),
 ('(to', 19),
 ('ñan', 18),
 ('(see', 17),
 ('(for', 17),
 ('(april)', 16),
 ('(which', 16),
 ('ñand', 16),
 ('(e)', 16),
 ('(may)', 15),
 ('ñone', 15),
 ('\\', 15),
 ('(poem)', 14),
 ('£', 14),
 ('(and', 14),
 ('(including', 13),
 ('(as', 13),
 ('(in', 13),
 ('[see', 13),
 ('ñno', 12),
 ('(f)', 12),
 ('¥¥¥', 12),
 ('ñto', 12),
 ('holland)', 12),
 ('ñid', 12),
 ('(incorporated', 12),
 ('(concluded', 11),
 ('ñtest', 11),
 ('>', 11),
 ('(or', 11),
 ('[poem]', 10),
 ('(continued', 10),
 ('+contents+', 10),
 ('ñmrs', 10),
 ('ñjune', 9),
 ('(acts', 8),
 ('(j', 8),
 ('ñjuly', 8),
 ('ñbrother', 8),
 ('ñi', 8),
 ('(january)', 8),
 ('¥¥¥¥¥¥¥¥', 7),
 ('ñhe', 7),
 ('(fourth', 7),
 ('[second', 7),
 ('ñfrom', 7),
 ('(june)', 7),
 ('christianityñits', 7),
 ('ñour', 7),
 ('ñmay', 7),
 ('<', 7),
 ('-¥', 7),
 ('ñspecial', 7),
 ('ñfebruary', 6),
 ('ñjohn', 6),
 ('(august)', 6),
 ('(july)', 6),
 ('ñwhen', 6),
 ('reading)', 6),
 ('[fourth', 6),
 ('ñfacts', 6),
 ('ñjust', 6),
 ('ñapril', 6),
 ('(i', 6),
 ('reading]', 6),
 ('(december', 6),
 ('ñdr', 6),
 ('the¥', 6),
 ('ñmarch', 5),
 ("'¥", 5),
 ('(if', 5),
 ('ñnot', 5),
 ('û', 5),
 ('(dan', 5),
 ('ñsince', 5),
 ('\ufeff', 5),
 ('%x', 5),
 ('ñseptember', 5),
 ('ñhome', 5),
 ('them)', 5),
 ('[', 5),
 ('world)', 5),
 ('++', 5),
 ('readingñsabbath', 5),
 ('it)', 5),
 ('(there', 5),
 ('(september)', 5),
 ('land)', 5),
 ('(rom', 5),
 ('~~', 5),
 ('(i)', 5),
 ('(from', 5),
 ('ñthat', 5),
 ('(g)', 4),
 ('(april', 4),
 ('cenñ', 4),
 ('[the', 4),
 ('{', 4),
 ('f\x90te-day', 4),
 ('ñrev', 4),
 ('ñbecause', 4),
 ('`i', 4),
 ('(rev', 4),
 ('ñmr', 4),
 ('(march', 4),
 ('(work', 4),
 ('(with', 4),
 ('school)', 4),
 ('ñdecember', 4),
 ('service]', 4),
 ('people)', 4),
 ('(of', 4),
 ('(verse', 4),
 ('ñon', 4),
 ('(oregon)', 4),
 ('darjeelingñabove', 4),
 ('¥¥¥¥', 4),
 ('ñheb', 4),
 ("¥'", 4),
 ('ñprofessor', 4),
 ('river)', 4),
 ('ñabout', 4),
 ('ñmatt', 4),
 ('ñduring', 4),
 ('ñjanuary', 4),
 ('(isa', 4),
 ('(february', 4),
 ('weeks)', 3),
 ('¥¥¥¥¥', 3),
 ('(building', 3),
 ('natives)', 3),
 ('_-', 3),
 ('mission)', 3),
 ('workñit', 3),
 ('(nearly', 3),
 ('lapps)', 3),
 ('year)', 3),
 ('¥v', 3),
 ('^', 3),
 ('ñevery', 3),
 ('a¥nd', 3),
 ('o)', 3),
 ('(that', 3),
 ('workñ', 3),
 ('ñmissionary', 3),
 ('(r', 3),
 ('water)', 3),
 ('))', 3),
 ('called)', 3),
 ('*the', 3),
 ('organ)', 3),
 ('-(', 3),
 ('}', 3),
 ('cooked)', 3),
 ('(it', 3),
 ('i)', 3),
 ('ñsel', 3),
 ('oolooberiaña', 3),
 ('[*', 3),
 ('ñas', 3),
 ('themñand', 3),
 ('ñoctober', 3),
 ('ñlast', 3),
 ('(called', 3),
 ('fund)', 3),
 ('¥-¥', 3),
 ('¥i', 3),
 ('may)', 3),
 ('(not', 3),
 ('ñisa', 3),
 ('(one', 3),
 ('ñs', 3),
 ('ñat', 3),
 ('ñthis', 3),
 ('(denmark', 3),
 ('ñpart', 3),
 ('exercise]', 3),
 ('(december)', 3),
 ('watchwordñ', 3),
 ('ñyes', 3),
 ('ñthere', 3),
 ('ñfor', 3),
 ('(john', 3),
 ('¥t', 3),
 ('are)', 3),
 ('±', 3),
 ('[illustrated', 3),
 ('brazilñ', 3),
 ('(kansas)', 2),
 ('(ind', 2),
 ('(most', 2),
 ('✓', 2),
 ('(meaning', 2),
 ('ñtwo', 2),
 ('r)', 2),
 ('ñgospel', 2),
 ('package)', 2),
 ('//', 2),
 ('ñspurgeon', 2),
 ('ñnew', 2),
 ('(six', 2),
 ('ago)', 2),
 ('ñgod', 2),
 ("+'", 2),
 ('departme\\t', 2),
 ('oneñthat', 2),
 ('ñselected', 2),
 ('s/', 2),
 ('_a', 2),
 ('ñso', 2),
 ('here)', 2),
 ('(though', 2),
 ('ö', 2),
 ('g%', 2),
 ('¥¥¥¥¥¥¥¥¥¥', 2),
 ('(about', 2),
 ('(ad', 2),
 ('(mark', 2),
 ('ruary)', 2),
 ('humanityñto', 2),
 ('spirit)', 2),
 ('¥the', 2),
 ('stãpittsburg', 2),
 ("¡'", 2),
 ('(f', 2),
 ('daughter)', 2),
 ('f\x90te', 2),
 ('missio\\ary', 2),
 ('(religious', 2),
 ('hallña', 2),
 ('only)', 2),
 ('(all', 2),
 ('ñyea', 2),
 ('ñletters', 2),
 ('/t', 2),
 ('¥s', 2),
 ('ñseveral', 2),
 ('christñthe', 2),
 ('*ghest', 2),
 ('(dutch', 2),
 ('troubleñwhether', 2),
 (')v', 2),
 ('`<', 2),
 ('ñother', 2),
 ('day)', 2),
 ('ñbut', 2),
 ('¥%', 2),
 ('times)', 2),
 ('(t', 2),
 ('ñr', 2),
 ('children)', 2),
 ('raceñthe', 2),
 ('(february)', 2),
 ('ioo¡', 2),
 ('cñtwin', 2),
 ('(christ)', 2),
 ('worldñis', 2),
 ('(trinidad)', 2),
 ('ñword', 2),
 ('week)', 2),
 ('(in-', 2),
 ('ñthey', 2),
 ('ñdesire', 2),
 ('ç', 2),
 ('colony)', 2),
 ('plata)', 2),
 ('(margin)', 2),
 ('time)', 2),
 ('gospelñthe', 2),
 ('the_', 2),
 ('(now', 2),
 ('church)', 2),
 ('¥new', 2),
 ('old)', 2),
 ('ñsome', 2),
 ('#', 2),
 ('ñwith', 2),
 ('english)', 2),
 ('ñchristian', 2),
 ('(but', 2),
 ('(southern)', 2),
 ('(denmark)', 2),
 ('t*', 2),
 ('(on', 2),
 ('states)', 2),
 ('ñof', 2),
 ('town)', 2),
 ('caf\x8es', 2),
 ('¥e', 2),
 ('churchñthe', 2),
 ('cut)', 2),
 ('worldñto', 2),
 ('nomñthe', 2),
 ('ig*', 2),
 ('(they', 2),
 ('ñif', 2),
 ('(miss', 2),
 ('ñpsalm', 2),
 ('april)', 2),
 ('(feb-', 2),
 ('exampleñthe', 2),
 ('ñtee', 2),
 ('ñerratum', 2),
 ('verseñ', 2),
 ('magazine)', 2),
 ('(january', 2),
 ('(holy', 2),
 ('¥well', 2),
 ('one¥', 2),
 ('¥he', 2),
 ('specialñthe', 2),
 ('fool)', 2),
 ('peopleñi', 2),
 ('(concluded)', 2),
 ('ñphillips', 2),
 ('feet)', 2),
 ('ñby', 2),
 ('(signs', 2),
 ('ý', 2),
 ('ñstudent', 2),
 ('z¥', 2),
 ('c¥', 2),
 ('ñis', 2),
 ('a¡', 2),
 ('[in', 2),
 ('ñspiritual', 2),
 ('parts)', 2),
 ('citiesñand', 2),
 ('(verses', 2),
 ('allñ', 2),
 ('beñthe', 2),
 ('-`', 2),
 ('e¥', 2),
 ('a¥re', 2),
 ('(coolies)', 2),
 ('jews)', 2),
 ('(an', 2),
 ('(sometimes', 2),
 ('ñhis', 2),
 ('ñluke', 2),
 ('i¥', 2),
 ('bay)', 2),
 ('ñdo', 2),
 ('cheetstãphiladelphia', 2),
 ('(generally', 2),
 ('ñout', 2),
 ('life)', 2),
 ('themñthe', 2),
 ('(thoroughly', 2),
 (')+', 2),
 ('erectedñone', 2),
 ('/-', 2),
 ('do)', 2),
 ('our¥', 2),
 ('(light', 2),
 ('(revelation', 2),
 ('godñnot', 2),
 ('(may', 2),
 ('[a', 2),
 ('(gal', 2),
 ('sayñand', 2),
 ('ñreview', 2),
 ('(later', 2),
 ('(michigan)', 2),
 ('**', 2),
 ('days)', 2),
 ('manña', 2),
 ('map)', 2),
 ('man)', 2),
 ('mallettñdear', 2),
 ('***', 2),
 ('``', 2),
 ('ñmy', 2),
 ('(god)', 2),
 ('ary)', 2),
 ('(alabama)', 2),
 ('[to', 2),
 ('(little', 2),
 ('(fig', 2),
 ('victoriañbut', 1),
 ('healthfully/', 1),
 ('thisñdishonesty', 1),
 ('(entre', 1),
 ('first)', 1),
 ('ñthree', 1),
 ('importa]', 1),
 ('partnerñnow', 1),
 ('_t_h/so', 1),
 ('ôof', 1),
 ('ñsucce', 1),
 ('churchñit', 1),
 ('menñthe', 1),
 ('body)', 1),
 ('(church)', 1),
 ('englandñto', 1),
 ('solitudeñthe', 1),
 ('(yang-tse', 1),
 ('f¥', 1),
 ('restñfor', 1),
 ('presentñperhaps', 1),
 ('preparationsñnot', 1),
 ('thingñfor', 1),
 ('ñaugust', 1),
 ('examination)', 1),
 ('possessionsñall', 1),
 ('*t', 1),
 ('societyñ', 1),
 ('yetñis', 1),
 ('dollarsñnine', 1),
 ("r'r%", 1),
 ('[orang', 1),
 ('kittsñthey', 1),
 ('text=bookñnovember', 1),
 ('(leap', 1),
 ('_enjoys', 1),
 ('/l', 1),
 ('millionsñone-third', 1),
 ('gatherings)', 1),
 ('cornñmealiesñis', 1),
 ('v/', 1),
 ('enciesñgrand', 1),
 ('power¥', 1),
 ('weaknessñ', 1),
 ('(thena', 1),
 ('winterñall', 1),
 ('ñlet', 1),
 ('myselfñduring', 1),
 ('on(', 1),
 ('hregardingv/', 1),
 ('floorsñthough', 1),
 ('first¥', 1),
 ('vationñhis', 1),
 ('encouraged)', 1),
 ('monstersñthe', 1),
 ('tonñunexcelled', 1),
 ('(wakenaam)', 1),
 ("tea'ã'is", 1),
 ('usñsend', 1),
 ('ôtis', 1),
 ('¥a', 1),
 ('worldñextends', 1),
 ('ñmost', 1),
 ('gu`', 1),
 ('vaticanñthe', 1),
 ('(cow', 1),
 ('*since', 1),
 ('argentina_', 1),
 ('ñg', 1),
 ('lotñit', 1),
 ('(j)', 1),
 ('countryñi', 1),
 ('slavesñcaptives', 1),
 ('stampsñamounting', 1),
 ('a#', 1),
 ('images)', 1),
 ('roomsñone', 1),
 ('messageñcaptain', 1),
 ('(sabbath-', 1),
 ('bondsñthese', 1),
 ('lakes)', 1),
 ('proml_tly', 1),
 ('saleñor', 1),
 ('continued)', 1),
 ('sister¥', 1),
 ('(helsingfors)', 1),
 ('(local', 1),
 ('about¥', 1),
 ('miiiiim=', 1),
 ('slightñfrom', 1),
 ('christiansñwe', 1),
 ('islesñst', 1),
 ('king¥', 1),
 ('w(', 1),
 ('(lao-tsze', 1),
 ('amo\\g', 1),
 ('(o', 1),
 ('toolñso', 1),
 ('fraternityñwhen', 1),
 ('messageña', 1),
 ('electro=hydropathic', 1),
 ('womenñcome', 1),
 ('(jamaica', 1),
 ('feverñ', 1),
 ('floor)', 1),
 ('demandñability', 1),
 ('knowledgeñthe', 1),
 ('studyñthe', 1),
 ('dayñso', 1),
 ('that)', 1),
 ('benedictionñelder', 1),
 ('ministersñall', 1),
 ('philippinesñbishop', 1),
 ('rageñall', 1),
 ('ãit', 1),
 ('familyñhe', 1),
 ('ñent', 1),
 ('continentñafrica', 1),
 ('this¥', 1),
 ('libertyñpolit-', 1),
 ('\\j', 1),
 ('doctorñassistant', 1),
 ('(adventist', 1),
 ('tal)', 1),
 ('sionaries)', 1),
 ('`voorlooper', 1),
 ('sideñand', 1),
 ('ñseven', 1),
 ('ñdifferent', 1),
 ('j¥', 1),
 ('actsñin', 1),
 ('ñmaybe', 1),
 ("'(", 1),
 ('c)', 1),
 ('ñsowing', 1),
 ('`there', 1),
 ('ground)', 1),
 ('letterñtwo', 1),
 ('biscuit)', 1),
 ('committeeñthat', 1),
 ('`kc', 1),
 ('classñthose', 1),
 ('nursesñwe', 1),
 ('ñpaul', 1),
 ('macheteñthe', 1),
 ('t`', 1),
 ('ñjesus', 1),
 ('a-*/**/¥', 1),
 ("^'cottiteri", 1),
 ('additionto_abont', 1),
 ('soulñ', 1),
 ('viewñabsolutely', 1),
 ('on/daniel', 1),
 ('/ft', 1),
 ('cattleñsecond', 1),
 ('(only', 1),
 ('fatallyñthe', 1),
 ('directionñgo', 1),
 ('arthur)', 1),
 ('instructionsñ', 1),
 ('dampña', 1),
 ('motherhoodñcannot', 1),
 ('biographyñthe', 1),
 ('¥+r', 1),
 (')l', 1),
 ('goingñ', 1),
 ('itñthe', 1),
 ('insectsñthey', 1),
 ('orderñthe', 1),
 ('millionñrussia', 1),
 ('kilaueañprobably', 1),
 ('countriesñan', 1),
 ('(speaking', 1),
 ('*henever', 1),
 ('countriesñguaranteed', 1),
 ('superiorñ', 1),
 ('=mummy', 1),
 ('cityñand', 1),
 ('ã-_', 1),
 ('¥what', 1),
 ('message)', 1),
 ('a^or', 1),
 ('breadfruitña', 1),
 ('encouragingñbut', 1),
 ('fieldsñit', 1),
 ('winterñalways', 1),
 ('crosses)', 1),
 ('mules)', 1),
 ('ñcanon', 1),
 ('againñ', 1),
 ('doneñwhen', 1),
 ('(at', 1),
 ('ancestorsñfor', 1),
 ('(continental', 1),
 ('boundñby', 1),
 ("-%'", 1),
 ('¥interest', 1),
 ('%-', 1),
 ('ñanoust', 1),
 ('¥professor', 1),
 ('destructionñbecause', 1),
 ('paper*', 1),
 ('ñplainly', 1),
 ('goodñwe', 1),
 ('australiañstellenbosch', 1),
 ('the/', 1),
 ('believers)', 1),
 ('fel¥', 1),
 ('car_', 1),
 ('slavesñslaves', 1),
 ('kingña', 1),
 ('`commanded', 1),
 ('billowsñall', 1),
 ('-/', 1),
 ('himñmay', 1),
 ('(apartment', 1),
 ('ct)', 1),
 ('agoutiña', 1),
 ('(fields)', 1),
 ('margin)', 1),
 ('numeralsñthe', 1),
 ('tiv(ptilst', 1),
 ('indiañ', 1),
 ('alaska)', 1),
 ('lazmig[', 1),
 ('inches=', 1),
 ('timeñthat', 1),
 ('(phil', 1),
 ('¥but', 1),
 ('eatñalthough', 1),
 ('missionary`', 1),
 ('hzinû', 1),
 ('(col', 1),
 ('__', 1),
 ('himselfñwithout', 1),
 ('[food]', 1),
 ('racesñkafir', 1),
 ('climate)', 1),
 ('crocodile)', 1),
 ('ñlaces', 1),
 ('mapsñno', 1),
 ('ourselvesñhere', 1),
 ('`lo', 1),
 ('dutyñthat', 1),
 ('weekñdecember', 1),
 ('stateñbolivia', 1),
 ('laile¥city', 1),
 ('exceptions)', 1),
 ('nameaa„ss❑', 1),
 ('colorsñthey', 1),
 ('(caravansary)', 1),
 ('marriageñher', 1),
 ('harborñsaid', 1),
 ('writeñeven', 1),
 ('¡s', 1),
 ('months_', 1),
 ('i¥¥', 1),
 ('thought)', 1),
 ('malesñwere', 1),
 ('w(`', 1),
 ('oppositionñwere', 1),
 ('-an/', 1),
 ('(mich', 1),
 ('patonñthat', 1),
 ('heardñsublimer', 1),
 ('(when', 1),
 ('ñhaving', 1),
 ('augustñin', 1),
 ('formerñthey', 1),
 ('brownñ', 1),
 ('hospitableñwilling', 1),
 ('soldiersñwhat', 1),
 ('actlt`', 1),
 ('\\i', 1),
 ('(kwi)', 1),
 ('spainñlonged', 1),
 ('baptizedñone', 1),
 ('¥sasnoh', 1),
 ('doneñ', 1),
 ('^ids', 1),
 ('seekñgo', 1),
 ("_masse'", 1),
 ('missionaryñat', 1),
 ('asiañtheir', 1),
 ('familyñthe', 1),
 ('carriage¥road', 1),
 ('(after', 1),
 ('earn=', 1),
 ('ôô`', 1),
 ('(freedom', 1),
 ('tk%', 1),
 ('certainñ', 1),
 ('selfñof', 1),
 ('(very', 1),
 ('obi-women)', 1),
 ('iiiiii=viii', 1),
 ('(their', 1),
 ('ñeducation', 1),
 ('(servants)', 1),
 ('ñbooker', 1),
 ('text-bookñ', 1),
 ('ñsmith', 1),
 ('wellñtime', 1),
 ('agesñi', 1),
 ('hulañperformed', 1),
 ('``yellow', 1),
 ('<¥', 1),
 ('countriesñfrance', 1),
 ('`and', 1),
 ('cruzñaside', 1),
 ('ropeñthe', 1),
 ('especiallyñand', 1),
 ('groundñperhaps', 1),
 ("ã'it", 1),
 ('spanish)', 1),
 ('*igit', 1),
 ('neighborsñone', 1),
 ('ic)', 1),
 ('theeñpray', 1),
 ('saved)', 1),
 ('¡god', 1),
 ('peaksñpopocatepetl', 1),
 ('ill_', 1),
 ('ñtestimonies', 1),
 ('charcoalñand', 1),
 ("(')", 1),
 ('ñgermany', 1),
 ('policyñhe', 1),
 ('rabbitñsupposing', 1),
 ('philip>', 1),
 ("ñkerr's", 1),
 ('weekñ', 1),
 ('possessionñwhere', 1),
 ('ñten', 1),
 ('texasñthe', 1),
 ('¥every', 1),
 ('desireñright', 1),
 ('loveñi', 1),
 ('%¥', 1),
 ('othersña', 1),
 ('(naini', 1),
 ('lifeñunto', 1),
 ('badñas', 1),
 ('republic)', 1),
 ('¥of', 1),
 ('macheteña', 1),
 ('(moravian)', 1),
 ('handsñmore', 1),
 ('papersñcopies', 1),
 ('ñdoes', 1),
 ('usñso', 1),
 ('fieldsñi', 1),
 ('stationñthe', 1),
 ('(mule-drivers)', 1),
 ('ñjanuanv', 1),
 ('styleñby', 1),
 ('christñit', 1),
 ('m)', 1),
 ('*presenting', 1),
 ('planñ', 1),
 ('babylon)', 1),
 ('ñmarca', 1),
 ('tokenñof', 1),
 ('loveñdie', 1),
 ('falls)', 1),
 ('antilles)', 1),
 ('loebsack)', 1),
 ('trvtk`t', 1),
 ('ñjoshua', 1),
 ('comeñ', 1),
 ('islandsñ', 1),
 ('stãboston', 1),
 ('levuñ', 1),
 ('¥chicago', 1),
 ('argentineñwill', 1),
 ('faithñnot', 1),
 ('ãtoward', 1),
 ('farmsñonly', 1),
 ('california¥', 1),
 ('nationsñall', 1),
 ('accommodated)', 1),
 ('guageñ', 1),
 ('fast-daysñdays', 1),
 ('someñthey', 1),
 ('bationñthe', 1),
 ('has_not', 1),
 ('chinañeven', 1),
 ('(d', 1),
 ('understandñfor', 1),
 ('complainingñonly', 1),
 ('-_-', 1),
 ('mail)', 1),
 ('creatureñ', 1),
 ("'lgl`", 1),
 ('mother-in-lawñthere', 1),
 ('¥be', 1),
 ('part)', 1),
 ('(zech', 1),
 ('so/apper', 1),
 ('«iay', 1),
 ('quartñmuch', 1),
 ('(ex', 1),
 ('ñtan', 1),
 ('enoughñto', 1),
 ('a)', 1),
 ('strangersñbut', 1),
 ('c+p', 1),
 ("ta'*", 1),
 ('destinationñcaravellasñinquire', 1),
 ('ñwomen', 1),
 ('worshipñsun', 1),
 ('iã', 1),
 ('liveñthey', 1),
 ('log)', 1),
 ('mexico)', 1),
 ('necessaries_', 1),
 ('olanchoñsavannas', 1),
 ('matabelelandñ', 1),
 ('soft`', 1),
 ('hong()', 1),
 ('tub*', 1),
 ('ñmark', 1),
 ('prisonña', 1),
 ('companyñ', 1),
 ('\\ad-', 1),
 ('chineseñhigh', 1),
 ('historyñso', 1),
 ('livingstoneñthe', 1),
 ('f`', 1),
 ('kv*mk', 1),
 ('=-¥', 1),
 ('classñthe', 1),
 ('*****', 1),
 ('coveringñonly', 1),
 ('=lead', 1),
 ('strugglesñthat', 1),
 ('countryñreceived', 1),
 ('enunciatedñtruths', 1),
 ('(mr', 1),
 ('descriptionñthe', 1),
 ('menñmen', 1),
 ('aspectñ', 1),
 ('destinationña', 1),
 ('rã', 1),
 ('torch)', 1),
 ('frontikl**-i', 1),
 ('thingsñand', 1),
 ('themñi', 1),
 ('t)', 1),
 ('four¥', 1),
 ('exerciseñnovember', 1),
 ('placeñtreating', 1),
 ('usñthat', 1),
 ('understand)', 1),
 ('tune)', 1),
 ('chinese¥', 1),
 ('salvadorñthe', 1),
 ('—segari', 1),
 ('peopleñthere', 1),
 ('wayñ', 1),
 ('groundñthat', 1),
 ('¤elf-governing', 1),
 ('doorsñthe', 1),
 ('monthly)', 1),
 ('_thee', 1),
 ('beñ', 1),
 ('(three', 1),
 ('beliefsñsome', 1),
 ('furnitureñ', 1),
 ('gardenñplaces', 1),
 ('(patience)', 1),
 ('yards)', 1),
 ('expression)', 1),
 ('godñeven', 1),
 ('denseñ', 1),
 ('whichñkusaie', 1),
 ('kindsñand', 1),
 ('yearly)', 1),
 ('truthñ', 1),
 ('coloradoñ', 1),
 ('ñoun', 1),
 ('mals)', 1),
 ('sir)', 1),
 ('aljna/-', 1),
 ('seañas', 1),
 ('gloomñthe', 1),
 ('itself)', 1),
 ('eaten)', 1),
 ('degradingñit', 1),
 ('*elder', 1),
 ('g¥', 1),
 ('ó', 1),
 ('christian)', 1),
 ('personal)', 1),
 ('medicineñand', 1),
 ('yearñever', 1),
 ('[prague]', 1),
 ('laborersñmr', 1),
 ("\\'i", 1),
 ('¥¥¥-¥', 1),
 ('worldñthe', 1),
 ('leftñwe', 1),
 ('illiterateñthe', 1),
 ('(satisfied)', 1),
 ('ir_', 1),
 ('v¥a', 1),
 ('switzerlandñhong', 1),
 ('jerusalemñthey', 1),
 ('neckñtonsilitis', 1),
 ('ñbefore', 1),
 ('uresñchanges', 1),
 ('ñeducational', 1),
 ('(n)', 1),
 ('(rum)', 1),
 ('to-o)', 1),
 ('creoles)', 1),
 ('blackñabout', 1),
 ('iû', 1),
 ('ñam', 1),
 ('fieldsñwhether', 1),
 ('lifô', 1),
 ('(his', 1),
 ('glad_', 1),
 ('(rome)', 1),
 ('floorñand', 1),
 ('¥in', 1),
 ('_that', 1),
 ('(unless', 1),
 ('aitutakians)', 1),
 ('pestñthe', 1),
 ('abroadñthe', 1),
 ('widowhoodñall', 1),
 ('switzerlandñbulu-', 1),
 ('(grave-', 1),
 ('restore¥', 1),
 ("(+'", 1),
 ('a_llc', 1),
 ('sliogunateñso', 1),
 ('laborer)', 1),
 ('wordñto', 1),
 ('(gospel', 1),
 ('viveritt/', 1),
 ('(almost', 1),
 ('-*', 1),
 ('(beneath', 1),
 ('ginñmore', 1),
 ('wasñjesus', 1),
 ("`surveying'", 1),
 ('(hot', 1),
 ('grythyttehedñnoted', 1),
 ('taotaisñ', 1),
 ('sectsñthe', 1),
 ('journeyñand', 1),
 ('exerciseñoctober', 1),
 ('a¥', 1),
 ('amphitheaterñto', 1),
 ('milesñmore', 1),
 ('(iowa)', 1),
 ('personsñnatives', 1),
 ('(joppa', 1),
 ('gospelñat', 1),
 ('(embraces)', 1),
 ('settlementñwhich', 1),
 ('voyageñ', 1),
 ('pôco', 1),
 ('operations¥', 1),
 ('ñreading', 1),
 ('/tis', 1),
 ('ñex-', 1),
 ('sceneryñand', 1),
 ('worldñif', 1),
 ('gehenna)', 1),
 ('natalñgeneva', 1),
 ('t#tnr=ligt', 1),
 ('steamerñthree', 1),
 ('groundñeven', 1),
 ('(some', 1),
 ('aristocraticñthe', 1),
 ('ñcyrus', 1),
 ('*isi', 1),
 ('valleyñ', 1),
 ('ginzañthe', 1),
 ('[when', 1),
 ('to-morrowñwhile', 1),
 ('door)', 1),
 ('-***', 1),
 ('truthñlearn', 1),
 ('journeyñto', 1),
 ...]

Correction 1 -- Normalize Characters

In [14]:
# %load shared_elements/normalize_characters.py
prev = "baseline"
cycle = "correction1"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    # Substitute for all other dashes
    content = re.sub(r"—-—–‑", r"-", content)

    # Substitute formatted apostrophe
    content = re.sub(r"\’\’\‘\'\‛\´", r"'", content)
    
    # Replace all special characters with a space (as these tend to occur at the end of lines)
    content = re.sub(r"[^a-zA-Z0-9\s,.!?$:;\-&\'\"]", r" ", content)
    
    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
In [17]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/TMM/correction1

Average verified rate: 0.9689343941867684

Average of error rates: 0.03858441558441559

Total token count: 869740

In [19]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[19]:
[('-', 1170),
 ("'", 573),
 ('e', 479),
 ('w', 475),
 ('m', 337),
 ('t', 314),
 ('r', 307),
 ('d', 301),
 ('n', 295),
 ('f', 268),
 ('con-', 259),
 ('g', 250),
 ('re-', 222),
 ('tion', 198),
 ('mis-', 161),
 ('in-', 149),
 ('com-', 117),
 ('th', 109),
 ('be-', 105),
 ('de-', 87),
 ('sionary', 87),
 ('mission-', 82),
 ('ment', 78),
 ('ex-', 77),
 ('ary', 74),
 ('x', 72),
 ('co', 70),
 ('tions', 69),
 ('u', 63),
 ('pa', 63),
 ('k', 63),
 ('en-', 61),
 ('per-', 59),
 ('pro-', 58),
 ('z', 58),
 ('dis-', 53),
 ('ple', 51),
 ('peo-', 49),
 ('pre-', 48),
 ('ers', 47),
 ('un-', 46),
 ('an-', 46),
 ('ad-', 44),
 ('ence', 42),
 ('io', 42),
 ('oc', 40),
 ('ber', 40),
 ('inter-', 39),
 ('for-', 38),
 ('ac-', 38)]

Correction 2 -- Correct Line Endings

In [21]:
# %load shared_elements/correct_line_endings.py
prev = cycle
cycle = "correction2"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    content = re.sub(r"(\w+)(\-\s{1,})([a-z]+)", r"\1\3", content)

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
In [24]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/TMM/correction2

Average verified rate: 0.9817024929526814

Average of error rates: 0.02631636363636364

Total token count: 862030

In [25]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[25]:
[('-', 1157),
 ("'", 573),
 ('e', 479),
 ('w', 475),
 ('m', 337),
 ('t', 312),
 ('r', 305),
 ('d', 301),
 ('n', 295),
 ('f', 267),
 ('g', 250),
 ('th', 109),
 ('x', 72),
 ('co', 69),
 ('pa', 63),
 ('k', 63),
 ('u', 63),
 ('z', 58),
 ('io', 42),
 ('oc', 40),
 ('mis-', 39),
 ('oo', 33),
 ('cc', 29),
 ('sionary', 29),
 ('--', 28),
 ('money-order', 24),
 ("'the", 23),
 ('q', 21),
 ('al', 21),
 ('mt', 20),
 ('ary', 19),
 ('id', 19),
 ('spanish-speaking', 19),
 ('hausaland', 19),
 ("''", 19),
 ('stauffer', 19),
 ('ft', 18),
 ('mo', 18),
 ('zo', 18),
 ('basle', 18),
 ('re', 18),
 ('hasegawa', 17),
 ('couva', 17),
 ('kalaka', 17),
 ('-the', 17),
 ('sul', 17),
 ('okohira', 16),
 ('ro', 16),
 ('sabbathschool', 15),
 ('pp', 15)]

Correction 3 -- Remove Extra Dashes

In [27]:
# %load shared_elements/remove_extra_dashes.py
prev = cycle
cycle = "correction3"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)
    
    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    
    replacements = []
    for token in tokens:
        if token[0] is "-":
            replacements.append((token, token[1:]))
            
        elif token[-1] is "-":
            replacements.append((token, token[:-1]))
        else:
            pass
        
    if len(replacements) > 0:
        print("{}: {}".format(filename, replacements))
        
        for replacement in replacements:
            content = clean.replace_pair(replacement, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
TMM18980101-V10-01-page1.txt: [('Mis-', 'Mis')]
TMM18980101-V10-01-page12.txt: [('-one', 'one'), ('-brought', 'brought'), ('-their', 'their'), ('-worship', 'worship')]
TMM18980101-V10-01-page13.txt: [('-appears', 'appears')]
TMM18980101-V10-01-page14.txt: [('-was', 'was')]
TMM18980101-V10-01-page15.txt: [('-Baptists', 'Baptists')]
TMM18980101-V10-01-page23.txt: [('respond-', 'respond')]
TMM18980101-V10-01-page26.txt: [('Waterloo-', 'Waterloo'), ('-Jamaica.', 'Jamaica.'), ('-by', 'by')]
TMM18980101-V10-01-page28.txt: [('-this', 'this')]
TMM18980101-V10-01-page32.txt: [('-WE', 'WE')]
TMM18980101-V10-01-page4.txt: [('--a', '-a'), ('-', ''), ('-rse', 'rse'), ('-', '')]
TMM18980101-V10-01-page9.txt: [('ene-', 'ene')]
TMM18980201-V10-02-page11.txt: [('---', '--')]
TMM18980201-V10-02-page13.txt: [('-K', 'K'), ('-N', 'N'), ('AricuN-', 'AricuN'), ('-', ''), ('-', '')]
TMM18980201-V10-02-page14.txt: [('Anglo-', 'Anglo'), ('-too', 'too')]
TMM18980201-V10-02-page17.txt: [('-miles', 'miles'), ('op-', 'op')]
TMM18980201-V10-02-page22.txt: [('prom-', 'prom')]
TMM18980201-V10-02-page27.txt: [('-', '')]
TMM18980201-V10-02-page32.txt: [('-', '')]
TMM18980201-V10-02-page33.txt: [('CON-', 'CON'), ('-', '')]
TMM18980201-V10-02-page35.txt: [('liter-', 'liter')]
TMM18980201-V10-02-page37.txt: [('MIS-', 'MIS'), ('QUAR-', 'QUAR'), ('.-', '.')]
TMM18980201-V10-02-page38.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM18980201-V10-02-page39.txt: [('SECRE-', 'SECRE'), ('Mts-', 'Mts')]
TMM18980201-V10-02-page6.txt: [('-', '')]
TMM18980201-V10-02-page7.txt: [('Nes-', 'Nes')]
TMM18980201-V10-02-page9.txt: [('-', ''), ('-', '')]
TMM18980301-V10-03-page12.txt: [('Gar-', 'Gar')]
TMM18980301-V10-03-page16.txt: [('-', '')]
TMM18980301-V10-03-page19.txt: [('-', '')]
TMM18980301-V10-03-page24.txt: [('Mis-', 'Mis')]
TMM18980301-V10-03-page25.txt: [('O-----', 'O----'), ('-reveladas', 'reveladas'), ('galar-', 'galar')]
TMM18980301-V10-03-page28.txt: [('com-', 'com')]
TMM18980301-V10-03-page31.txt: [('-be', 'be'), ('-work', 'work')]
TMM18980301-V10-03-page32.txt: [('---g.', '--g.'), ('-P.', 'P.'), ('-krka', 'krka')]
TMM18980301-V10-03-page37.txt: [('estab-', 'estab'), ('Jan-', 'Jan')]
TMM18980301-V10-03-page38.txt: [('-', '')]
TMM18980301-V10-03-page39.txt: [('Mis-', 'Mis'), ('Mis-', 'Mis'), ('-', '')]
TMM18980301-V10-03-page5.txt: [('-', '')]
TMM18980301-V10-03-page6.txt: [('C--', 'C-'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('..-', '..'), ('-', ''), ('-', '')]
TMM18980401-V10-04-page15.txt: [('-', '')]
TMM18980401-V10-04-page17.txt: [('-', '')]
TMM18980401-V10-04-page26.txt: [('HISTOR-', 'HISTOR')]
TMM18980401-V10-04-page3.txt: [('-', '')]
TMM18980401-V10-04-page30.txt: [('-', '')]
TMM18980401-V10-04-page31.txt: [('-', '')]
TMM18980401-V10-04-page33.txt: [('-I', 'I')]
TMM18980401-V10-04-page38.txt: [('encourag-', 'encourag')]
TMM18980401-V10-04-page4.txt: [('-', ''), ('-c-', 'c-'), ('-', ''), ('s-', 's'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('e-', 'e'), ('-', ''), ('-.t.', '.t.'), ('-', ''), ('-', ''), ('-', ''), ('..-', '..'), ('-', '')]
TMM18980401-V10-04-page40.txt: [('--Did', '-Did')]
TMM18980401-V10-04-page6.txt: [('WORK-', 'WORK')]
TMM18980501-V10-05-page17.txt: [('-', '')]
TMM18980501-V10-05-page24.txt: [('-T.', 'T.')]
TMM18980501-V10-05-page25.txt: [('-', ''), ('-', '')]
TMM18980501-V10-05-page28.txt: [('-.', '.'), ('i-', 'i'), ('-d', 'd'), ('-', ''), ('.-', '.'), ('-', ''), ('-s-azppos', 's-azppos')]
TMM18980501-V10-05-page29.txt: [('-', '')]
TMM18980501-V10-05-page30.txt: [('-', '')]
TMM18980501-V10-05-page31.txt: [('"Teu-', '"Teu')]
TMM18980501-V10-05-page35.txt: [('MIS-', 'MIS'), ('QUAR-', 'QUAR'), ('-', '')]
TMM18980501-V10-05-page36.txt: [('-', ''), ('.-', '.')]
TMM18980501-V10-05-page37.txt: [('has-', 'has')]
TMM18980501-V10-05-page38.txt: [('-THREE', 'THREE')]
TMM18980501-V10-05-page39.txt: [('re-', 're')]
TMM18980601-V10-06-page16.txt: [('-', ''), ('-', '')]
TMM18980601-V10-06-page20.txt: [('-------A--', '------A--'), ('-.', '.'), ('..-', '..'), ('--', '-'), ('--', '-'), ('--..', '-..'), ('-...t', '...t'), ('..-', '..'), ('-', ''), ("....-k'-", "....-k'"), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('K-', 'K'), ('-', ''), ('.-', '.'), ('..--', '..-'), ('-.', '.'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-AN', 'AN'), ('-', ''), ('--', '-'), ('ir-', 'ir'), ('-', ''), ('--', '-'), ('-AI', 'AI'), ('--', '-'), ('-', ''), ('-', ''), ('-', ''), ('.--', '.-'), ('---', '--')]
TMM18980601-V10-06-page22.txt: [('lb.-', 'lb.')]
TMM18980601-V10-06-page23.txt: [('-JUNE', 'JUNE'), ('-', '')]
TMM18980601-V10-06-page24.txt: [('-', ''), ('-', '')]
TMM18980601-V10-06-page25.txt: [('-', ''), ('-', '')]
TMM18980601-V10-06-page26.txt: [('doing-', 'doing')]
TMM18980601-V10-06-page36.txt: [('-', ''), ('leav-', 'leav')]
TMM18980701-V10-07-page10.txt: [('-', '')]
TMM18980701-V10-07-page11.txt: [('-lying', 'lying')]
TMM18980701-V10-07-page12.txt: [('--How', '-How')]
TMM18980701-V10-07-page30.txt: [('Amsterdam-', 'Amsterdam')]
TMM18980701-V10-07-page36.txt: [('inter-', 'inter')]
TMM18980701-V10-07-page38.txt: [('-c', 'c'), ('..-', '..'), ('-', ''), ('-', '')]
TMM18980701-V10-07-page4.txt: [('num-', 'num'), ('-', '')]
TMM18980701-V10-07-page40.txt: [('MAG-', 'MAG')]
TMM18980701-V10-07-page42.txt: [('-', '')]
TMM18980701-V10-07-page6.txt: [('Young-', 'Young')]
TMM18980801-V10-08-page21.txt: [('Euro-', 'Euro')]
TMM18980801-V10-08-page24.txt: [('conse-', 'conse')]
TMM18980801-V10-08-page26.txt: [('MISSIONARY-', 'MISSIONARY')]
TMM18980801-V10-08-page31.txt: [('z-', 'z'), ('bountifully."-', 'bountifully."')]
TMM18980801-V10-08-page35.txt: [('MIS-', 'MIS'), ('QUAR-', 'QUAR'), ('.-', '.'), ('.-', '.')]
TMM18980801-V10-08-page38.txt: [('MIS-', 'MIS'), ('MIS-', 'MIS')]
TMM18980901-V10-09-page13.txt: [('-in', 'in')]
TMM18980901-V10-09-page16.txt: [('house-', 'house')]
TMM18980901-V10-09-page31.txt: [('-refining', 'refining'), ('-', '')]
TMM18980901-V10-09-page35.txt: [('me-', 'me')]
TMM18980901-V10-09-page37.txt: [('tear-drops--', 'tear-drops-')]
TMM18980901-V10-09-page8.txt: [('-', '')]
TMM18980901-V10-09-page9.txt: [('-view', 'view')]
TMM18981001-V10-10-page11.txt: [('-and', 'and'), ('-Fiance', 'Fiance'), ('-', '')]
TMM18981001-V10-10-page16.txt: [('-would', 'would')]
TMM18981001-V10-10-page18.txt: [('-', '')]
TMM18981001-V10-10-page19.txt: [('-', '')]
TMM18981001-V10-10-page21.txt: [('-rented', 'rented'), ('-', '')]
TMM18981001-V10-10-page28.txt: [('-', ''), ('edu-', 'edu')]
TMM18981001-V10-10-page30.txt: [('-', '')]
TMM18981001-V10-10-page33.txt: [('-', '')]
TMM18981001-V10-10-page34.txt: [('-', ''), ('-', '')]
TMM18981001-V10-10-page37.txt: [('-', ''), ('-c', 'c')]
TMM18981001-V10-10-page38.txt: [('-', '')]
TMM18981101-V10-11-page12.txt: [('-', '')]
TMM18981101-V10-11-page17.txt: [('MIS-', 'MIS')]
TMM18981101-V10-11-page20.txt: [('cor-', 'cor')]
TMM18981101-V10-11-page25.txt: [('MAGA-', 'MAGA')]
TMM18981101-V10-11-page27.txt: [('fol-', 'fol')]
TMM18981101-V10-11-page29.txt: [('-rendering', 'rendering')]
TMM18981101-V10-11-page30.txt: [('-', ''), ('liter-', 'liter')]
TMM18981101-V10-11-page31.txt: [('-', '')]
TMM18981101-V10-11-page33.txt: [('-at', 'at')]
TMM18981101-V10-11-page34.txt: [('-', '')]
TMM18981101-V10-11-page35.txt: [('-', '')]
TMM18981101-V10-11-page36.txt: [('SOCI-', 'SOCI'), ('.-', '.'), ('MIS-', 'MIS'), ('QUAR-', 'QUAR')]
TMM18981101-V10-11-page37.txt: [('-', ''), ('MAGA-', 'MAGA')]
TMM18981101-V10-11-page38.txt: [('MAGA-', 'MAGA')]
TMM18981101-V10-11-page6.txt: [('great-', 'great')]
TMM18981101-V10-11-page7.txt: [('-', '')]
TMM18981201-V10-12-page13.txt: [('igno-', 'igno')]
TMM18981201-V10-12-page17.txt: [('interme-', 'interme')]
TMM18981201-V10-12-page19.txt: [('-little', 'little')]
TMM18981201-V10-12-page2.txt: [('-mighty', 'mighty'), ('op-', 'op')]
TMM18981201-V10-12-page23.txt: [('-her.', 'her.')]
TMM18981201-V10-12-page27.txt: [('-', '')]
TMM18981201-V10-12-page32.txt: [('-', '')]
TMM18981201-V10-12-page36.txt: [('Sab-', 'Sab')]
TMM18981201-V10-12-page4.txt: [('-', '')]
TMM18981201-V10-12-page41.txt: [('-', '')]
TMM18981201-V10-12-page43.txt: [('--Near', '-Near'), ('-THE', 'THE')]
TMM18981201-V10-12-page44.txt: [('Par-', 'Par')]
TMM18981201-V10-12-page45.txt: [('Character-', 'Character')]
TMM18981201-V10-12-page46.txt: [('Mission-', 'Mission')]
TMM18990101-V11-01-page12.txt: [('-teach', 'teach'), ('-standing', 'standing'), ('-the', 'the')]
TMM18990101-V11-01-page13.txt: [('-', '')]
TMM18990101-V11-01-page14.txt: [('-', ''), ("'-", "'"), ('oranges-and-', 'oranges-and'), ('-I', 'I'), ('-', ''), ('-of', 'of'), ('-mines', 'mines'), ('-', '')]
TMM18990101-V11-01-page17.txt: [('flower-', 'flower')]
TMM18990101-V11-01-page19.txt: [('-', '')]
TMM18990101-V11-01-page2.txt: [('THI-', 'THI')]
TMM18990101-V11-01-page25.txt: [('-', '')]
TMM18990101-V11-01-page27.txt: [('-the', 'the'), ('-people', 'people'), ('-their', 'their'), ('-of', 'of')]
TMM18990101-V11-01-page28.txt: [('-', '')]
TMM18990101-V11-01-page29.txt: [('DAR-ES-', 'DAR-ES'), ('MIS-', 'MIS')]
TMM18990101-V11-01-page31.txt: [('the-', 'the')]
TMM18990101-V11-01-page32.txt: [('-voyage.', 'voyage.')]
TMM18990101-V11-01-page36.txt: [('success-', 'success')]
TMM18990101-V11-01-page38.txt: [('whole-', 'whole')]
TMM18990101-V11-01-page44.txt: [('stop--', 'stop-')]
TMM18990101-V11-01-page45.txt: [('-', '')]
TMM18990101-V11-01-page47.txt: [('-', ''), ('PHILADEL-', 'PHILADEL'), ('MISSION-', 'MISSION')]
TMM18990101-V11-01-page48.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('.-', '.'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-.', '.'), ('-', ''), ('repre-', 'repre'), ('-.', '.')]
TMM18990101-V11-01-page8.txt: [('-rainy', 'rainy')]
TMM18990101-V11-01-page9.txt: [('-', ''), ('Spanish-', 'Spanish')]
TMM18990201-V11-02-page1.txt: [('Guiana-', 'Guiana')]
TMM18990201-V11-02-page13.txt: [('-', '')]
TMM18990201-V11-02-page15.txt: [('-', '')]
TMM18990201-V11-02-page18.txt: [('un-', 'un')]
TMM18990201-V11-02-page23.txt: [('-', '')]
TMM18990201-V11-02-page25.txt: [('-', '')]
TMM18990201-V11-02-page28.txt: [('-', '')]
TMM18990201-V11-02-page29.txt: [('WORK-', 'WORK')]
TMM18990201-V11-02-page30.txt: [('-as', 'as'), ('-development', 'development'), ("-of'", "of'"), ('suf-', 'suf')]
TMM18990201-V11-02-page38.txt: [('-they', 'they')]
TMM18990201-V11-02-page41.txt: [('con-', 'con')]
TMM18990201-V11-02-page47.txt: [('-', ''), ('-', ''), ('-', '')]
TMM18990201-V11-02-page48.txt: [('inquir-', 'inquir')]
TMM18990201-V11-02-page51.txt: [('-', '')]
TMM18990201-V11-02-page52.txt: [('MIS-', 'MIS'), ('QUAR-', 'QUAR'), ('-', ''), ('.-', '.'), ('.-', '.'), ('-', ''), ('.-', '.'), ('-', ''), ('.-', '.'), ('.-', '.'), ('.-', '.'), ('-', ''), ('r.-', 'r.'), ('.-', '.')]
TMM18990201-V11-02-page53.txt: [('other-', 'other')]
TMM18990201-V11-02-page54.txt: [('-', ''), ('PHILADEL-', 'PHILADEL'), ('MISSION-', 'MISSION')]
TMM18990201-V11-02-page55.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-.', '.'), ('-', ''), ('-', ''), ("-'", "'"), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-P', 'P')]
TMM18990201-V11-02-page9.txt: [('-', '')]
TMM18990301-V11-03-page11.txt: [('for-', 'for'), ('-', '')]
TMM18990301-V11-03-page13.txt: [('-', ''), ("----'", "---'")]
TMM18990301-V11-03-page25.txt: [('-"Christian', '"Christian')]
TMM18990301-V11-03-page29.txt: [('HALE.-', 'HALE.')]
TMM18990301-V11-03-page30.txt: [('-still', 'still')]
TMM18990301-V11-03-page31.txt: [('HOFFMAN-', 'HOFFMAN')]
TMM18990301-V11-03-page37.txt: [('-', ''), ('con-', 'con')]
TMM18990301-V11-03-page38.txt: [('PHILADEL-', 'PHILADEL'), ('MISSION-', 'MISSION')]
TMM18990301-V11-03-page39.txt: [('-', ''), ('-', ''), ('-', ''), ('-.', '.'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-.', '.')]
TMM18990301-V11-03-page40.txt: [('-', '')]
TMM18990301-V11-03-page7.txt: [('-Burrus', 'Burrus')]
TMM18990301-V11-03-page9.txt: [('frame-', 'frame')]
TMM18990401-V11-04-page10.txt: [('the-', 'the')]
TMM18990401-V11-04-page18.txt: [('-', '')]
TMM18990401-V11-04-page23.txt: [('-', '')]
TMM18990401-V11-04-page26.txt: [('-and', 'and')]
TMM18990401-V11-04-page29.txt: [('-', ''), ('-', ''), ('-', '')]
TMM18990401-V11-04-page38.txt: [('Indo-', 'Indo'), ('PHILADEL-', 'PHILADEL'), ('MISSION-', 'MISSION')]
TMM18990401-V11-04-page39.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-I-', 'I-'), ('PreNit-', 'PreNit'), ('-', ''), ('repre-', 'repre'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM18990501-V11-05-page16.txt: [('-Syria', 'Syria')]
TMM18990501-V11-05-page24.txt: [('of-', 'of')]
TMM18990501-V11-05-page31.txt: [('English-', 'English'), ('French-', 'French')]
TMM18990501-V11-05-page34.txt: [('-SABBATH', 'SABBATH')]
TMM18990501-V11-05-page35.txt: [('-', ''), ('-', '')]
TMM18990501-V11-05-page37.txt: [('-', '')]
TMM18990501-V11-05-page39.txt: [('-', '')]
TMM18990501-V11-05-page41.txt: [('-', '')]
TMM18990501-V11-05-page42.txt: [('MIS-', 'MIS'), ('QUAR-', 'QUAR'), ('-', ''), ('.-', '.'), ('-', '')]
TMM18990501-V11-05-page43.txt: [('con-', 'con')]
TMM18990501-V11-05-page45.txt: [('--IA', '-IA')]
TMM18990501-V11-05-page46.txt: [('-', ''), ('time.-', 'time.'), ('PHILADEL-', 'PHILADEL'), ('MISSION-', 'MISSION')]
TMM18990501-V11-05-page47.txt: [('-.', '.'), ('-', ''), ('.-', '.'), ('-', ''), ('-', ''), ('repre-', 'repre')]
TMM18990601-V11-06-page1.txt: [('-wide', 'wide')]
TMM18990601-V11-06-page11.txt: [('-', ''), ('-every', 'every'), ('fever.-', 'fever.'), ('-no', 'no')]
TMM18990601-V11-06-page12.txt: [('-lines', 'lines')]
TMM18990601-V11-06-page2.txt: [('-is', 'is')]
TMM18990601-V11-06-page3.txt: [('-is', 'is')]
TMM18990601-V11-06-page30.txt: [('-', '')]
TMM18990601-V11-06-page38.txt: [('-', '')]
TMM18990601-V11-06-page39.txt: [('-almost', 'almost'), ('-heathen', 'heathen'), ('-uses', 'uses'), ('-the', 'the'), ('-very', 'very'), ('-', '')]
TMM18990601-V11-06-page4.txt: [('-we', 'we'), ('-DO', 'DO'), ('work.-', 'work.'), ('-for', 'for')]
TMM18990601-V11-06-page46.txt: [('PHILADEL-', 'PHILADEL'), ('MISSION-', 'MISSION'), ('-', '')]
TMM18990601-V11-06-page47.txt: [('-', '')]
TMM18990601-V11-06-page7.txt: [('things.--', 'things.-')]
TMM18990701-V11-07-page11.txt: [('ex-', 'ex'), ('p-', 'p')]
TMM18990701-V11-07-page13.txt: [('-the', 'the')]
TMM18990701-V11-07-page16.txt: [('the-', 'the')]
TMM18990701-V11-07-page17.txt: [('-of', 'of'), ('-these', 'these'), ('-the', 'the'), ('-to', 'to')]
TMM18990701-V11-07-page19.txt: [('reading--', 'reading-')]
TMM18990701-V11-07-page2.txt: [('-the', 'the'), ('-term', 'term'), ('"When-', '"When'), ('-teacher', 'teacher'), ('-to', 'to')]
TMM18990701-V11-07-page20.txt: [('theerec-', 'theerec')]
TMM18990701-V11-07-page23.txt: [('-provision', 'provision'), ('reached-', 'reached'), ('Iztaccihuatl-', 'Iztaccihuatl')]
TMM18990701-V11-07-page26.txt: [('V-', 'V')]
TMM18990701-V11-07-page27.txt: [('so--', 'so-')]
TMM18990701-V11-07-page28.txt: [('-because', 'because')]
TMM18990701-V11-07-page32.txt: [('.-', '.'), ('-', '')]
TMM18990701-V11-07-page36.txt: [('formerly-', 'formerly'), ('receiving-', 'receiving')]
TMM18990701-V11-07-page37.txt: [('pray-', 'pray'), ('-era.', 'era.')]
TMM18990701-V11-07-page4.txt: [('-recent', 'recent')]
TMM18990701-V11-07-page40.txt: [('-', '')]
TMM18990701-V11-07-page42.txt: [('-', ''), ('of-', 'of')]
TMM18990701-V11-07-page43.txt: [('-fices', 'fices')]
TMM18990701-V11-07-page46.txt: [('the-', 'the'), ('MIS-', 'MIS')]
TMM18990701-V11-07-page5.txt: [('gov-', 'gov'), ('-to', 'to')]
TMM18990701-V11-07-page9.txt: [('Fahren-', 'Fahren')]
TMM18990801-V11-08-page11.txt: [('-.', '.'), ('-', ''), ('-', ''), ('-.', '.'), ('-E', 'E'), ('--s', '-s'), ('-"', '"'), ('-I.', 'I.'), ('C".-', 'C".'), ('-t-', 't-'), ('-', ''), ('-C', 'C'), ("-'", "'"), ('-', ''), ('Vcc-', 'Vcc'), ('-', ''), ('-', ''), ('-l', 'l'), ('-c', 'c'), ('-P', 'P'), ('TIc.-', 'TIc.'), ('-Lt.', 'Lt.'), ('a-', 'a'), ('-C', 'C'), ('-.', '.'), ('-c', 'c'), ('-c', 'c'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('m-', 'm'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('--', '-'), ('-', ''), ('-c', 'c'), ('..-', '..'), ('-L', 'L'), ('lec-', 'lec'), ('-', ''), ('.F-', '.F'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-se-tt', 'se-tt'), ('N-', 'N'), ('-', ''), ("it'-", "it'"), ('-V', 'V'), ('-', ''), ('-', ''), ('-', ''), ('iV-', 'iV'), ('-', ''), ('-', ''), ('I.-', 'I.'), ('-', ''), ('-', ''), ('-.', '.'), ("-'", "'"), ('--', '-'), ('-it', 'it'), ('mew.Pgx-', 'mew.Pgx'), ("-T'..", "T'.."), ('-', ''), ('lectlf-', 'lectlf'), ('-.', '.'), ('-', ''), ('-', ''), ('-', ''), ('-.c', '.c'), ('-', ''), ('-.r.', '.r.'), ('-P', 'P'), ('-', ''), ('-', ''), (".''rt.Mgk-", ".''rt.Mgk"), ('-', ''), ('-.', '.'), ('-c', 'c'), ('-', ''), ('-', ''), ('cte-', 'cte'), ("-'", "'"), ('-', ''), ('--', '-'), ('-', ''), ('-', ''), ('-', ''), ('-.', '.'), ('.-', '.'), ('-c', 'c'), ('-', ''), ('Llc-', 'Llc'), ('-', ''), ('-LAE', 'LAE'), ('-', ''), ('-', ''), ('-ore', 'ore')]
TMM18990801-V11-08-page18.txt: [('-', '')]
TMM18990801-V11-08-page2.txt: [('-', '')]
TMM18990801-V11-08-page21.txt: [('-', '')]
TMM18990801-V11-08-page31.txt: [('-upon', 'upon'), ('-that', 'that'), ('-.means', '.means')]
TMM18990801-V11-08-page32.txt: [('-', ''), ('-', '')]
TMM18990801-V11-08-page33.txt: [('-that', 'that'), ('-thousand', 'thousand'), ('-', '')]
TMM18990801-V11-08-page36.txt: [('orr-', 'orr')]
TMM18990801-V11-08-page37.txt: [('-"like', '"like'), ('-that', 'that'), ('-', '')]
TMM18990801-V11-08-page40.txt: [('-that', 'that')]
TMM18990801-V11-08-page41.txt: [('-emptied', 'emptied'), ('Him-', 'Him'), ('-self', 'self'), ('-consume', 'consume'), ('-', '')]
TMM18990801-V11-08-page43.txt: [('-', '')]
TMM18990801-V11-08-page45.txt: [('MIS-', 'MIS'), ('.-', '.'), ('-', ''), ('-', '')]
TMM18990801-V11-08-page46.txt: [('-OFFICE', 'OFFICE'), ('-', ''), ('MIS-', 'MIS')]
TMM18990801-V11-08-page9.txt: [('-', '')]
TMM18990901-V11-09-page1.txt: [('-', '')]
TMM18990901-V11-09-page23.txt: [('-', '')]
TMM18990901-V11-09-page25.txt: [('-the', 'the'), ('-they', 'they'), ('-teach', 'teach')]
TMM18990901-V11-09-page27.txt: [('-to', 'to')]
TMM18990901-V11-09-page3.txt: [('-wa-re', 'wa-re'), ('MISSION-', 'MISSION')]
TMM18990901-V11-09-page34.txt: [('-', '')]
TMM18990901-V11-09-page36.txt: [('.-', '.')]
TMM18990901-V11-09-page44.txt: [('-', '')]
TMM18990901-V11-09-page46.txt: [('-', ''), ('MIS-', 'MIS')]
TMM18990901-V11-09-page47.txt: [('-A--', 'A--'), ('-', ''), ('-al', 'al'), ('-', '')]
TMM18991001-V11-10-page1.txt: [('wit-', 'wit')]
TMM18991001-V11-10-page10.txt: [('--', '-')]
TMM18991001-V11-10-page14.txt: [('-', ''), ('QUEENSLAND.-', 'QUEENSLAND.')]
TMM18991001-V11-10-page16.txt: [('-', '')]
TMM18991001-V11-10-page2.txt: [('-', '')]
TMM18991001-V11-10-page3.txt: [('-that', 'that')]
TMM18991001-V11-10-page30.txt: [('-devote', 'devote')]
TMM18991001-V11-10-page4.txt: [('-', '')]
TMM18991001-V11-10-page44.txt: [('-', ''), ('be-', 'be')]
TMM18991001-V11-10-page45.txt: [('-What', 'What')]
TMM18991001-V11-10-page46.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('MIS-', 'MIS')]
TMM18991001-V11-10-page5.txt: [('Mc-', 'Mc'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM18991101-V11-11-page12.txt: [('-we', 'we')]
TMM18991101-V11-11-page23.txt: [('.-', '.'), ("c'te-", "c'te"), ('-', ''), ('i"-', 'i"'), ("-'...", "'..."), ('-', ''), ('F--', 'F-'), ('C-', 'C'), ('-.', '.')]
TMM18991101-V11-11-page24.txt: [('-', ''), ('-', ''), ('-a', 'a')]
TMM18991101-V11-11-page27.txt: [('-to', 'to')]
TMM18991101-V11-11-page32.txt: [('-numbered', 'numbered'), ('-', '')]
TMM18991101-V11-11-page33.txt: [('-holy', 'holy')]
TMM18991101-V11-11-page37.txt: [('MAGA-', 'MAGA')]
TMM18991101-V11-11-page40.txt: [('-', ''), ('---First', '--First')]
TMM18991101-V11-11-page42.txt: [('MIS-', 'MIS')]
TMM18991101-V11-11-page43.txt: [('sur-', 'sur')]
TMM18991101-V11-11-page44.txt: [('-', ''), ('Side-', 'Side')]
TMM18991101-V11-11-page46.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('MIS-', 'MIS')]
TMM18991101-V11-11-page47.txt: [('r-', 'r'), ('-', ''), ('A-', 'A')]
TMM18991201-V11-12-page18.txt: [('.-', '.')]
TMM18991201-V11-12-page20.txt: [('-', '')]
TMM18991201-V11-12-page29.txt: [('-', '')]
TMM18991201-V11-12-page3.txt: [('mist-', 'mist')]
TMM18991201-V11-12-page34.txt: [('-than', 'than')]
TMM18991201-V11-12-page36.txt: [('-', '')]
TMM18991201-V11-12-page37.txt: [('-', '')]
TMM18991201-V11-12-page38.txt: [('-', ''), ('-', '')]
TMM18991201-V11-12-page39.txt: [('-', ''), ('-', ''), ('-', '')]
TMM18991201-V11-12-page40.txt: [('-', ''), ('-', ''), ('-', '')]
TMM18991201-V11-12-page41.txt: [('-', ''), ('-', '')]
TMM18991201-V11-12-page42.txt: [('-', '')]
TMM18991201-V11-12-page45.txt: [('-', '')]
TMM18991201-V11-12-page46.txt: [('-', ''), ('MIS-', 'MIS')]
TMM18991201-V11-12-page9.txt: [('-', '')]
TMM19000101-V12-01-page10.txt: [('-i', 'i'), ('-.', '.'), ('-', ''), ('--', '-'), ('---', '--'), ('------', '-----'), ('A-', 'A'), ('-', ''), ('-', ''), ("-.'", ".'"), ('-', ''), ('-', ''), ('..-', '..'), ('-.Z---....', '.Z---....'), ('-', ''), ("--.'b", "-.'b"), ('-', ''), ('X-', 'X'), ("-l'''", "l'''"), ('-', ''), ('-', ''), ('-.', '.'), ('-', ''), ('-', ''), ('-', ''), ('..-', '..')]
TMM19000101-V12-01-page2.txt: [('con-', 'con')]
TMM19000101-V12-01-page25.txt: [('self-', 'self')]
TMM19000101-V12-01-page3.txt: [('-v-tvot', 'v-tvot')]
TMM19000101-V12-01-page30.txt: [('-drew', 'drew'), ('primi-', 'primi')]
TMM19000101-V12-01-page33.txt: [('-', '')]
TMM19000101-V12-01-page34.txt: [('-', '')]
TMM19000101-V12-01-page37.txt: [('MAGA-', 'MAGA')]
TMM19000101-V12-01-page38.txt: [('MAG-', 'MAG'), ('-', ''), ('-', '')]
TMM19000101-V12-01-page39.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19000101-V12-01-page4.txt: [('Nice."-', 'Nice."')]
TMM19000101-V12-01-page40.txt: [('-', ''), ('-', ''), ('-', ''), ('judg-', 'judg'), ('-', '')]
TMM19000101-V12-01-page41.txt: [('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19000101-V12-01-page42.txt: [('-', ''), ('-', ''), ('-', '')]
TMM19000101-V12-01-page43.txt: [('-', ''), ('-', '')]
TMM19000101-V12-01-page44.txt: [('MIS-', 'MIS')]
TMM19000101-V12-01-page46.txt: [('-BISHOP', 'BISHOP')]
TMM19000101-V12-01-page47.txt: [('MAGA-', 'MAGA'), ('MISSION-', 'MISSION')]
TMM19000101-V12-01-page48.txt: [('-MISSIONARY', 'MISSIONARY')]
TMM19000101-V12-01-page50.txt: [('MAGA-', 'MAGA'), ('MIS-', 'MIS')]
TMM19000101-V12-01-page51.txt: [('-.Seventh', '.Seventh'), ('-page', 'page'), ('earn-', 'earn'), ('-', ''), ('PRO-', 'PRO')]
TMM19000101-V12-01-page52.txt: [('DEVELOP-', 'DEVELOP')]
TMM19000101-V12-01-page6.txt: [('un-', 'un')]
TMM19000201-V12-02-page1.txt: [('IN-', 'IN')]
TMM19000201-V12-02-page13.txt: [('-', '')]
TMM19000201-V12-02-page15.txt: [('weak-', 'weak')]
TMM19000201-V12-02-page19.txt: [('-cannot', 'cannot')]
TMM19000201-V12-02-page2.txt: [('-sold.', 'sold.')]
TMM19000201-V12-02-page3.txt: [('-', '')]
TMM19000201-V12-02-page32.txt: [('-', '')]
TMM19000201-V12-02-page33.txt: [('-', '')]
TMM19000201-V12-02-page34.txt: [('MAGAZINE-', 'MAGAZINE'), ('-', ''), ('-', ''), ('-', '')]
TMM19000201-V12-02-page35.txt: [('-', ''), ('-', ''), ('-', '')]
TMM19000201-V12-02-page36.txt: [('-', ''), ('-', ''), ('Medo-', 'Medo'), ('corre-', 'corre'), ('-each', 'each'), ('-', ''), ('-', '')]
TMM19000201-V12-02-page37.txt: [('-', ''), ('-', ''), ('-', '')]
TMM19000201-V12-02-page38.txt: [('-', ''), ('-', '')]
TMM19000201-V12-02-page39.txt: [('-', ''), ('-', ''), ('-', ''), ('-MARCH', 'MARCH')]
TMM19000201-V12-02-page40.txt: [('-', ''), ('-', '')]
TMM19000201-V12-02-page41.txt: [('-', '')]
TMM19000201-V12-02-page43.txt: [('mission-', 'mission')]
TMM19000201-V12-02-page44.txt: [('-farm', 'farm'), ('-their', 'their')]
TMM19000201-V12-02-page46.txt: [('MIS-', 'MIS')]
TMM19000201-V12-02-page47.txt: [('in-', 'in')]
TMM19000201-V12-02-page49.txt: [('Miss-', 'Miss')]
TMM19000201-V12-02-page50.txt: [('-', ''), ('MIS-', 'MIS')]
TMM19000201-V12-02-page51.txt: [('ANIMAL."PRO-', 'ANIMAL."PRO'), ('-', ''), ('-text', 'text'), ('-', ''), ('-tiFFI', 'tiFFI'), ('H.-', 'H.'), ('-', ''), ('-', ''), ('-page', 'page'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19000201-V12-02-page52.txt: [('-', ''), ('al-', 'al')]
TMM19000301-V12-03-page10.txt: [('Hongkong----', 'Hongkong---'), ('-', ''), ('-who', 'who')]
TMM19000301-V12-03-page11.txt: [('-', '')]
TMM19000301-V12-03-page13.txt: [('table-', 'table')]
TMM19000301-V12-03-page18.txt: [('-', ''), ('going-', 'going')]
TMM19000301-V12-03-page2.txt: [('-', ''), ('-', '')]
TMM19000301-V12-03-page26.txt: [('-', '')]
TMM19000301-V12-03-page34.txt: [('-', ''), ('na-', 'na')]
TMM19000301-V12-03-page35.txt: [('-', '')]
TMM19000301-V12-03-page36.txt: [('-', ''), ('-', '')]
TMM19000301-V12-03-page39.txt: [('-', '')]
TMM19000301-V12-03-page41.txt: [('-', ''), ('-', ''), ('-', '')]
TMM19000301-V12-03-page42.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19000301-V12-03-page43.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19000301-V12-03-page44.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19000301-V12-03-page45.txt: [('Superin-', 'Superin'), ('-', ''), ('thy-', 'thy'), ('--Ider', '-Ider'), ('-school', 'school'), ('-turn', 'turn')]
TMM19000301-V12-03-page47.txt: [('-a', 'a'), ('-a', 'a'), ('C-', 'C')]
TMM19000301-V12-03-page48.txt: [('-', ''), ('-ton', 'ton')]
TMM19000301-V12-03-page5.txt: [('expres-', 'expres'), ('-They', 'They')]
TMM19000301-V12-03-page8.txt: [('Wall-', 'Wall'), ('-', '')]
TMM19000301-V12-03-page9.txt: [('M-', 'M'), ('-c.', 'c.'), ('X-', 'X'), ('---', '--')]
TMM19000401-V12-04-page1.txt: [('-', '')]
TMM19000401-V12-04-page14.txt: [('-', ''), ('-', ''), ('-', '')]
TMM19000401-V12-04-page15.txt: [('-', '')]
TMM19000401-V12-04-page2.txt: [('con-', 'con')]
TMM19000401-V12-04-page33.txt: [('-future', 'future')]
TMM19000401-V12-04-page39.txt: [('-', '')]
TMM19000401-V12-04-page40.txt: [('-utmost', 'utmost')]
TMM19000401-V12-04-page43.txt: [('-', '')]
TMM19000401-V12-04-page44.txt: [('-', ''), ('-', ''), ('-', '')]
TMM19000401-V12-04-page45.txt: [('-', ''), ('-', ''), ('-', '')]
TMM19000401-V12-04-page46.txt: [('-', ''), ('-', ''), ('-', ''), ('expedi-', 'expedi')]
TMM19000401-V12-04-page47.txt: [('-', ''), ('-', '')]
TMM19000401-V12-04-page48.txt: [('-public', 'public')]
TMM19000401-V12-04-page50.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('MIS-', 'MIS')]
TMM19000401-V12-04-page51.txt: [('-r', 'r'), ('-need-to', 'need-to'), ('Sub-', 'Sub'), ('C-', 'C'), ('Postpaid.-', 'Postpaid.'), ('-', '')]
TMM19000401-V12-04-page52.txt: [('-York.', 'York.')]
TMM19000401-V12-04-page7.txt: [('-', ''), ('-', '')]
TMM19000401-V12-04-page8.txt: [('con-', 'con')]
TMM19000501-V12-05-page10.txt: [('Saint-', 'Saint')]
TMM19000501-V12-05-page11.txt: [('-', '')]
TMM19000501-V12-05-page12.txt: [('cere-', 'cere')]
TMM19000501-V12-05-page14.txt: [('examina-', 'examina')]
TMM19000501-V12-05-page15.txt: [('-', '')]
TMM19000501-V12-05-page22.txt: [('-', '')]
TMM19000501-V12-05-page25.txt: [('-', ''), ('---', '--'), ('-------', '------'), ('--.', '-.'), ('--', '-'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('----', '---'), ('--.', '-.'), ('-', ''), ('-.-.', '.-.'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('.----', '.---'), ('---', '--'), ('------.', '-----.'), ('-', ''), ('t.---', 't.--'), ('-', ''), ('-', ''), ('----.-', '---.-'), ('-', '')]
TMM19000501-V12-05-page26.txt: [('third-', 'third')]
TMM19000501-V12-05-page29.txt: [('restric-', 'restric')]
TMM19000501-V12-05-page3.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19000501-V12-05-page32.txt: [('MISSION-', 'MISSION')]
TMM19000501-V12-05-page37.txt: [('re-', 're')]
TMM19000501-V12-05-page39.txt: [('-MAY', 'MAY'), ('-', ''), ('-', ''), ('-', '')]
TMM19000501-V12-05-page40.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19000501-V12-05-page41.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19000501-V12-05-page42.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-JUNE', 'JUNE'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19000501-V12-05-page43.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19000501-V12-05-page44.txt: [('-', '')]
TMM19000501-V12-05-page45.txt: [('MIS-', 'MIS')]
TMM19000501-V12-05-page5.txt: [('sol-', 'sol'), ('second-', 'second')]
TMM19000501-V12-05-page50.txt: [('-', ''), ('-', ''), ('-', ''), ('MIS-', 'MIS')]
TMM19000501-V12-05-page51.txt: [('A-', 'A'), ('God.-', 'God.'), ('be-', 'be'), ('be-', 'be'), ('-', ''), ('A-', 'A'), ('-', ''), ("'si-", "'si"), ('-', ''), ('A-', 'A'), ('-', ''), ('-', '')]
TMM19000501-V12-05-page6.txt: [('idola-', 'idola')]
TMM19000601-V12-06-page12.txt: [('-', '')]
TMM19000601-V12-06-page13.txt: [('-the', 'the')]
TMM19000601-V12-06-page19.txt: [('ap-', 'ap')]
TMM19000601-V12-06-page23.txt: [('surround-', 'surround')]
TMM19000601-V12-06-page27.txt: [('-', '')]
TMM19000601-V12-06-page28.txt: [('con-', 'con')]
TMM19000601-V12-06-page29.txt: [('amplifi-', 'amplifi'), ('-', '')]
TMM19000601-V12-06-page33.txt: [('the-', 'the')]
TMM19000601-V12-06-page34.txt: [('-', '')]
TMM19000601-V12-06-page37.txt: [('-', ''), ('devasta-', 'devasta')]
TMM19000601-V12-06-page38.txt: [('-acre', 'acre')]
TMM19000601-V12-06-page39.txt: [('-foot', 'foot')]
TMM19000601-V12-06-page44.txt: [('com-', 'com'), ('-', ''), ('-', '')]
TMM19000601-V12-06-page45.txt: [('-', ''), ('-', '')]
TMM19000601-V12-06-page48.txt: [('-', '')]
TMM19000601-V12-06-page49.txt: [('cur-', 'cur')]
TMM19000601-V12-06-page5.txt: [('prepara-', 'prepara')]
TMM19000601-V12-06-page50.txt: [('MIS-', 'MIS')]
TMM19000601-V12-06-page51.txt: [('A-', 'A'), ('A-', 'A'), ('be-', 'be'), ('-', '')]
TMM19000601-V12-06-page52.txt: [('-', '')]
TMM19000701-V12-07-page11.txt: [('reveren-', 'reveren')]
TMM19000701-V12-07-page13.txt: [('mem-', 'mem')]
TMM19000701-V12-07-page14.txt: [('-', '')]
TMM19000701-V12-07-page26.txt: [('-', ''), ('-', ''), ('-', '')]
TMM19000701-V12-07-page32.txt: [('con-', 'con')]
TMM19000701-V12-07-page34.txt: [('-', '')]
TMM19000701-V12-07-page36.txt: [('-', '')]
TMM19000701-V12-07-page44.txt: [('READING-', 'READING')]
TMM19000701-V12-07-page46.txt: [('-', ''), ('-', '')]
TMM19000701-V12-07-page47.txt: [('-', ''), ('-', '')]
TMM19000701-V12-07-page48.txt: [('-', '')]
TMM19000701-V12-07-page49.txt: [('-', '')]
TMM19000701-V12-07-page5.txt: [('-though', 'though')]
TMM19000701-V12-07-page50.txt: [('-', ''), ('-', ''), ('-', ''), ('MIS-', 'MIS')]
TMM19000701-V12-07-page51.txt: [('A-', 'A'), ('be-', 'be'), ('A-', 'A')]
TMM19000701-V12-07-page6.txt: [('congrega-', 'congrega')]
TMM19000701-V12-07-page9.txt: [('promul-', 'promul'), ('sub-', 'sub')]
TMM19000801-V12-08-page1.txt: [('-Vol.', 'Vol.')]
TMM19000801-V12-08-page10.txt: [('funda-', 'funda')]
TMM19000801-V12-08-page17.txt: [('-', ''), ('-', ''), ('-', '')]
TMM19000801-V12-08-page19.txt: [('road-', 'road'), ('be-', 'be')]
TMM19000801-V12-08-page2.txt: [('con-', 'con')]
TMM19000801-V12-08-page20.txt: [('-', ''), ('-', ''), ('-', '')]
TMM19000801-V12-08-page21.txt: [('appar-', 'appar')]
TMM19000801-V12-08-page23.txt: [('-', '')]
TMM19000801-V12-08-page26.txt: [('Astra-', 'Astra')]
TMM19000801-V12-08-page27.txt: [('Tscher-', 'Tscher')]
TMM19000801-V12-08-page28.txt: [('shin-', 'shin')]
TMM19000801-V12-08-page3.txt: [('-', '')]
TMM19000801-V12-08-page31.txt: [('--', '-')]
TMM19000801-V12-08-page33.txt: [('-', '')]
TMM19000801-V12-08-page38.txt: [('-', '')]
TMM19000801-V12-08-page39.txt: [('-', ''), ('-', '')]
TMM19000801-V12-08-page40.txt: [('v-', 'v')]
TMM19000801-V12-08-page44.txt: [('MIS-', 'MIS'), ('.-', '.'), ('.-', '.'), ('.-', '.'), ('-', ''), ('.-', '.'), ('.-', '.'), ('.-', '.'), ('.-', '.'), ('.-', '.'), ('.-', '.'), ('.-', '.'), ('.-', '.'), ('.-', '.')]
TMM19000801-V12-08-page45.txt: [('-', ''), ('.-', '.')]
TMM19000801-V12-08-page46.txt: [('-THE', 'THE')]
TMM19000801-V12-08-page47.txt: [('-', ''), ('-', '')]
TMM19000801-V12-08-page48.txt: [('pos-', 'pos')]
TMM19000801-V12-08-page49.txt: [('GA-', 'GA')]
TMM19000801-V12-08-page5.txt: [('View-', 'View'), ('Gos-', 'Gos')]
TMM19000801-V12-08-page50.txt: [('MtssioN-', 'MtssioN'), ('-', ''), ('-', ''), ('-', ''), ('MIS-', 'MIS')]
TMM19000801-V12-08-page7.txt: [('presi-', 'presi')]
TMM19000901-V12-09-page11.txt: [('-', '')]
TMM19000901-V12-09-page16.txt: [('an-', 'an'), ('cere-', 'cere'), ('con-', 'con')]
TMM19000901-V12-09-page24.txt: [('-to', 'to')]
TMM19000901-V12-09-page25.txt: [('-', ''), ('man-', 'man')]
TMM19000901-V12-09-page28.txt: [('di-', 'di')]
TMM19000901-V12-09-page29.txt: [('-return', 'return'), ('--', '-'), ('rep-', 'rep')]
TMM19000901-V12-09-page32.txt: [('-', '')]
TMM19000901-V12-09-page33.txt: [('-one', 'one'), ('adelan-', 'adelan')]
TMM19000901-V12-09-page42.txt: [('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19000901-V12-09-page43.txt: [('-', '')]
TMM19000901-V12-09-page44.txt: [('-SEPTEMBER', 'SEPTEMBER'), ('-', ''), ('-', '')]
TMM19000901-V12-09-page45.txt: [('-', ''), ('-', '')]
TMM19000901-V12-09-page46.txt: [('-twice', 'twice')]
TMM19000901-V12-09-page47.txt: [('Ho-', 'Ho')]
TMM19000901-V12-09-page48.txt: [('-and', 'and'), ('-charge', 'charge'), ('-connection', 'connection')]
TMM19000901-V12-09-page49.txt: [('-', ''), ('-', '')]
TMM19000901-V12-09-page50.txt: [('MIS-', 'MIS'), ('.-', '.')]
TMM19000901-V12-09-page51.txt: [('con-', 'con'), ('-', '')]
TMM19000901-V12-09-page52.txt: [('Blau-', 'Blau')]
TMM19001001-V12-10-page1.txt: [('-i', 'i'), ('--', '-'), ('-', ''), ('-', ''), ('-', ''), ('-.', '.'), ('-', ''), ('-', '')]
TMM19001001-V12-10-page14.txt: [('chil-', 'chil')]
TMM19001001-V12-10-page15.txt: [('--that', '-that')]
TMM19001001-V12-10-page18.txt: [('-', '')]
TMM19001001-V12-10-page2.txt: [('-', ''), ('-Australia', 'Australia')]
TMM19001001-V12-10-page23.txt: [('Pi-', 'Pi')]
TMM19001001-V12-10-page29.txt: [('-', ''), ('hold-', 'hold')]
TMM19001001-V12-10-page30.txt: [('scamper-', 'scamper')]
TMM19001001-V12-10-page44.txt: [('E-', 'E'), ('-', '')]
TMM19001001-V12-10-page45.txt: [('-', '')]
TMM19001001-V12-10-page46.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('---', '--'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19001001-V12-10-page47.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19001001-V12-10-page49.txt: [('-paid', 'paid')]
TMM19001001-V12-10-page50.txt: [('--August', '-August'), ('-', ''), ('-', ''), ('MIS-', 'MIS')]
TMM19001001-V12-10-page51.txt: [('-', '')]
TMM19001001-V12-10-page52.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-BEARING', 'BEARING'), ('utensil-', 'utensil')]
TMM19001001-V12-10-page6.txt: [('fellowship-', 'fellowship'), ('influ-', 'influ')]
TMM19001001-V12-10-page8.txt: [('MISSION-', 'MISSION'), ('MAG-', 'MAG'), ('condi-', 'condi')]
TMM19001101-V12-11-page1.txt: [('-', '')]
TMM19001101-V12-11-page13.txt: [('believ-', 'believ')]
TMM19001101-V12-11-page15.txt: [('cab-', 'cab')]
TMM19001101-V12-11-page18.txt: [('igno-', 'igno')]
TMM19001101-V12-11-page19.txt: [('se-', 'se'), ('rever-', 'rever')]
TMM19001101-V12-11-page20.txt: [('French-', 'French')]
TMM19001101-V12-11-page22.txt: [('-United', 'United')]
TMM19001101-V12-11-page24.txt: [('produc-', 'produc'), ('-', ''), ('ap-', 'ap')]
TMM19001101-V12-11-page27.txt: [('-', ''), ('-', '')]
TMM19001101-V12-11-page31.txt: [('-the', 'the')]
TMM19001101-V12-11-page32.txt: [('-', '')]
TMM19001101-V12-11-page34.txt: [('-', '')]
TMM19001101-V12-11-page38.txt: [('-NovEmnEn', 'NovEmnEn'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19001101-V12-11-page39.txt: [('-', ''), ('-Our', 'Our'), ('-', '')]
TMM19001101-V12-11-page40.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19001101-V12-11-page41.txt: [('-That', 'That'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19001101-V12-11-page42.txt: [('-DECEMBER', 'DECEMBER'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19001101-V12-11-page43.txt: [('-', '')]
TMM19001101-V12-11-page44.txt: [('-', ''), ('-', ''), ('MIS-', 'MIS'), ('.-', '.'), ('.-', '.'), ('.-', '.'), ('.-', '.'), ('.-', '.')]
TMM19001101-V12-11-page45.txt: [('.-', '.'), ('.-', '.'), ('.-', '.'), ('.-', '.'), ('.-', '.')]
TMM19001101-V12-11-page46.txt: [('-', ''), ('-', ''), ('.-', '.')]
TMM19001101-V12-11-page48.txt: [('-THE', 'THE')]
TMM19001101-V12-11-page49.txt: [('Sabbath-', 'Sabbath')]
TMM19001101-V12-11-page50.txt: [('MIS-', 'MIS')]
TMM19001101-V12-11-page51.txt: [('-', ''), ('makeCon-', 'makeCon'), ('-cured.', 'cured.'), ('.successful-', '.successful'), ('-', ''), ('-', ''), ('-name', 'name')]
TMM19001101-V12-11-page52.txt: [('-ought', 'ought'), ('-BEARING', 'BEARING'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ("-'not", "'not"), ('-With', 'With'), ('--', '-')]
TMM19001101-V12-11-page8.txt: [('-', '')]
TMM19001201-V12-12-page10.txt: [('ba-', 'ba')]
TMM19001201-V12-12-page14.txt: [('-', ''), ('-lb-', 'lb-'), ('-', ''), ('-', ''), ('-', '')]
TMM19001201-V12-12-page19.txt: [('-', '')]
TMM19001201-V12-12-page2.txt: [('-Oakland', 'Oakland')]
TMM19001201-V12-12-page23.txt: [('-', '')]
TMM19001201-V12-12-page3.txt: [('Spirit-', 'Spirit')]
TMM19001201-V12-12-page30.txt: [('-', ''), ('-', '')]
TMM19001201-V12-12-page31.txt: [('lo-', 'lo')]
TMM19001201-V12-12-page34.txt: [('blast--', 'blast-')]
TMM19001201-V12-12-page35.txt: [('-', ''), ('-', '')]
TMM19001201-V12-12-page37.txt: [('under-', 'under')]
TMM19001201-V12-12-page39.txt: [('-the', 'the')]
TMM19001201-V12-12-page42.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19001201-V12-12-page43.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19001201-V12-12-page44.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19001201-V12-12-page45.txt: [('-', ''), ('-', '')]
TMM19001201-V12-12-page47.txt: [('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19001201-V12-12-page48.txt: [('-Oun', 'Oun')]
TMM19001201-V12-12-page49.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19001201-V12-12-page50.txt: [('-', ''), ('-', ''), ('MIS-', 'MIS')]
TMM19001201-V12-12-page51.txt: [('-', '')]
TMM19001201-V12-12-page6.txt: [('an-', 'an'), ('Ad-', 'Ad'), ('-', '')]
TMM19001201-V12-12-page7.txt: [('advance-', 'advance')]
TMM19020101-V14-01-page1.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('--', '-'), ('-.New', '.New'), ('-', ''), ('-', ''), ('-The', 'The'), ('-', ''), ('-', ''), ('Alaska-', 'Alaska'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-.Mayaguez', '.Mayaguez'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('U-', 'U')]
TMM19020101-V14-01-page12.txt: [('Natal-', 'Natal')]
TMM19020101-V14-01-page13.txt: [('salva-', 'salva')]
TMM19020101-V14-01-page15.txt: [('re-', 're')]
TMM19020101-V14-01-page16.txt: [('-', ''), ('-', '')]
TMM19020101-V14-01-page17.txt: [('resur-', 'resur')]
TMM19020101-V14-01-page18.txt: [('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19020101-V14-01-page2.txt: [('LIFT-', 'LIFT'), ('-PRICE', 'PRICE'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('l-', 'l')]
TMM19020101-V14-01-page23.txt: [('impos-', 'impos')]
TMM19020101-V14-01-page27.txt: [('dis-', 'dis')]
TMM19020101-V14-01-page29.txt: [('-', '')]
TMM19020101-V14-01-page31.txt: [('-', ''), ('Pe-', 'Pe')]
TMM19020101-V14-01-page38.txt: [('heav-', 'heav')]
TMM19020101-V14-01-page49.txt: [('nec-', 'nec')]
TMM19020101-V14-01-page50.txt: [('-', ''), ('SECOND-', 'SECOND'), ('MAG-', 'MAG')]
TMM19020101-V14-01-page7.txt: [('grow-', 'grow')]
TMM19020101-V14-01-page8.txt: [('-', ''), ('-page', 'page')]
TMM19020201-V14-02-page1.txt: [('-ii', 'ii'), ('-', '')]
TMM19020201-V14-02-page12.txt: [('-', ''), ('Cama-', 'Cama')]
TMM19020201-V14-02-page14.txt: [('jus-', 'jus')]
TMM19020201-V14-02-page15.txt: [('-', '')]
TMM19020201-V14-02-page2.txt: [('-', ''), ('-', ''), ('-', '')]
TMM19020201-V14-02-page21.txt: [('MISSION-', 'MISSION')]
TMM19020201-V14-02-page25.txt: [('-eyes', 'eyes'), ('Ital-', 'Ital')]
TMM19020201-V14-02-page33.txt: [('neces-', 'neces')]
TMM19020201-V14-02-page35.txt: [('-', '')]
TMM19020201-V14-02-page38.txt: [('Erken-', 'Erken')]
TMM19020201-V14-02-page45.txt: [('-', '')]
TMM19020201-V14-02-page46.txt: [('Okla-', 'Okla')]
TMM19020201-V14-02-page47.txt: [('Fund.-', 'Fund.'), ('-', ''), ('Relief.-', 'Relief.'), ('Tithe.-', 'Tithe.'), ('Donations.-', 'Donations.'), ('Mission.-', 'Mission.'), ('Mission.-', 'Mission.'), ('Mission.-', 'Mission.'), ('Mission.-', 'Mission.'), ('Mission.-', 'Mission.'), ('Mission.-', 'Mission.'), ('Mission.-', 'Mission.'), ('Mission.-', 'Mission.'), ('Mission.-', 'Mission.'), ('School.-', 'School.'), ('Mission.-', 'Mission.'), ('Mission.-', 'Mission.'), ('Field.-', 'Field.'), ('Africa.-', 'Africa.'), ('Mission.-', 'Mission.')]
TMM19020201-V14-02-page49.txt: [('-', ''), ('ARIZONA.-', 'ARIZONA.'), ('CALIFORNIA.-', 'CALIFORNIA.'), ('-', ''), ('CUMBER-', 'CUMBER'), ('FLORIDA.-', 'FLORIDA.'), ('GEORGIA.-', 'GEORGIA.'), ('ILLINOIS.-', 'ILLINOIS.'), ('TERRITORY.-', 'TERRITORY.'), ('KANSAS.-', 'KANSAS.'), ('LOUISI-', 'LOUISI'), ('ANA.-', 'ANA.'), ('MIcHIGAN.-', 'MIcHIGAN.'), ('MINNESOTA.-', 'MINNESOTA.'), ('MISSOURI.-', 'MISSOURI.'), ('NE-', 'NE'), ('BRASKA.-', 'BRASKA.'), ('YORK.-', 'YORK.'), ('CAROLINA.-', 'CAROLINA.'), ('Oxio.-', 'Oxio.'), ('TERRITORY.-', 'TERRITORY.'), ('OREGON.-', 'OREGON.'), ('PENNSYLVANIA.-', 'PENNSYLVANIA.'), ('DAKOTA.-', 'DAKOTA.'), ('TEXAS.-', 'TEXAS.'), ('VERMONT.-', 'VERMONT.'), ('-', ''), ('VIRGINIA.-', 'VIRGINIA.'), ('g.-', 'g.')]
TMM19020201-V14-02-page5.txt: [('-', '')]
TMM19020201-V14-02-page50.txt: [('SECOND-', 'SECOND'), ('Expirations.-', 'Expirations.'), ('MAG-', 'MAG'), ('TEUTONIC-', 'TEUTONIC'), ('CELTIC-', 'CELTIC'), ('-', '')]
TMM19020201-V14-02-page51.txt: [('-', '')]
TMM19020201-V14-02-page52.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('RAIL-', 'RAIL')]
TMM19020201-V14-02-page7.txt: [('Holland-', 'Holland')]
TMM19020201-V14-02-page8.txt: [('-', ''), ('Advent-', 'Advent')]
TMM19020301-V14-03-page1.txt: [('--', '-'), ('-', ''), ('-The', 'The'), ('Hungary-', 'Hungary'), ('-', ''), ('-In', 'In'), ('L-', 'L'), ('-', ''), ('--', '-'), ('-', ''), ('.-', '.'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('Tuna-', 'Tuna'), ('-', ''), ("'Children-", "'Children")]
TMM19020301-V14-03-page10.txt: [('country-', 'country'), ('prin-', 'prin')]
TMM19020301-V14-03-page12.txt: [('com-', 'com')]
TMM19020301-V14-03-page14.txt: [('meet-', 'meet'), ('de-', 'de')]
TMM19020301-V14-03-page17.txt: [('Mon-', 'Mon')]
TMM19020301-V14-03-page2.txt: [('-', ''), ('-Apply', 'Apply'), ('WathiOR-', 'WathiOR'), ('-', ''), ('-', ''), ('.-', '.')]
TMM19020301-V14-03-page21.txt: [('increas-', 'increas')]
TMM19020301-V14-03-page22.txt: [('-', '')]
TMM19020301-V14-03-page26.txt: [('-', ''), ('appear-', 'appear'), ('-', ''), ('-miles', 'miles'), ('-', ''), ('Ar-', 'Ar')]
TMM19020301-V14-03-page27.txt: [('mem-', 'mem')]
TMM19020301-V14-03-page3.txt: [('RAIL-', 'RAIL'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19020301-V14-03-page31.txt: [('-', '')]
TMM19020301-V14-03-page35.txt: [('propor-', 'propor')]
TMM19020301-V14-03-page39.txt: [('IN-', 'IN')]
TMM19020301-V14-03-page40.txt: [('-A', 'A')]
TMM19020301-V14-03-page43.txt: [('-a', 'a')]
TMM19020301-V14-03-page45.txt: [('-gain', 'gain'), ('dark-', 'dark')]
TMM19020301-V14-03-page47.txt: [('-pressed', 'pressed')]
TMM19020301-V14-03-page49.txt: [('-', '')]
TMM19020301-V14-03-page5.txt: [('A-', 'A')]
TMM19020301-V14-03-page50.txt: [('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19020301-V14-03-page51.txt: [('Foun-', 'Foun'), ('-', ''), ('-inch', 'inch'), ('-inch', 'inch'), ('-in.', 'in.'), ('-in.', 'in.'), ('-', '')]
TMM19020301-V14-03-page52.txt: [('-', ''), ('-', '')]
TMM19020301-V14-03-page7.txt: [('---', '--'), ('-.', '.'), ('-i', 'i'), ('-.', '.'), ('-----', '----'), ('-', ''), ('.-', '.'), ('----', '---'), ('-', ''), ("--'---", "-'---"), ('-', ''), ('-', ''), ('-', ''), ('f--', 'f-')]
TMM19020301-V14-03-page8.txt: [('-', '')]
TMM19020301-V14-03-page9.txt: [('--or', '-or')]
TMM19020401-V14-04-page1.txt: [('-.', '.'), ('.CONTENTSib-', '.CONTENTSib'), ('-', ''), ('-', '')]
TMM19020401-V14-04-page11.txt: [('-strong', 'strong')]
TMM19020401-V14-04-page13.txt: [('-', '')]
TMM19020401-V14-04-page16.txt: [('p-a--', 'p-a-'), ('-arr', 'arr'), ('-', ''), ('-e', 'e'), ('-', ''), ('-', ''), ("--C'rA", "-C'rA"), ('-X', 'X')]
TMM19020401-V14-04-page2.txt: [('-', '')]
TMM19020401-V14-04-page24.txt: [('re-', 're')]
TMM19020401-V14-04-page3.txt: [('RAIL-', 'RAIL'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19020401-V14-04-page34.txt: [('-', ''), ('expe-', 'expe')]
TMM19020401-V14-04-page43.txt: [('institu-', 'institu')]
TMM19020401-V14-04-page46.txt: [('Brother-', 'Brother')]
TMM19020401-V14-04-page49.txt: [('accord-', 'accord')]
TMM19020401-V14-04-page50.txt: [('SECOND-', 'SECOND'), ('-', '')]
TMM19020401-V14-04-page52.txt: [('-.', '.')]
TMM19020401-V14-04-page7.txt: [('At-', 'At'), ('-rtitxm', 'rtitxm'), ('kk-t-', 'kk-t'), ('-', ''), ('.z-', '.z'), ('wt-', 'wt'), ('m-', 'm'), ('-', ''), ('t-', 't')]
TMM19020401-V14-04-page8.txt: [('--', '-')]
TMM19020501-V14-05-page1.txt: [('.-', '.'), ('-Jamaica', 'Jamaica'), ('-', ''), ('-', ''), ('-.', '.')]
TMM19020501-V14-05-page10.txt: [('sta-', 'sta')]
TMM19020501-V14-05-page11.txt: [('-', ''), ('suc-', 'suc')]
TMM19020501-V14-05-page13.txt: [('moun-', 'moun')]
TMM19020501-V14-05-page16.txt: [('-in', 'in')]
TMM19020501-V14-05-page17.txt: [('mis-', 'mis')]
TMM19020501-V14-05-page2.txt: [('-', ''), ('AS--', 'AS-'), ('-OUR', 'OUR'), ('-gives', 'gives')]
TMM19020501-V14-05-page24.txt: [('-first-day', 'first-day'), ('-', '')]
TMM19020501-V14-05-page25.txt: [('-', ''), ('-', '')]
TMM19020501-V14-05-page27.txt: [('out-', 'out')]
TMM19020501-V14-05-page3.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('RAIL-', 'RAIL')]
TMM19020501-V14-05-page31.txt: [('in-', 'in')]
TMM19020501-V14-05-page35.txt: [('Method-', 'Method')]
TMM19020501-V14-05-page36.txt: [('-', '')]
TMM19020501-V14-05-page4.txt: [('-', '')]
TMM19020501-V14-05-page40.txt: [('experi-', 'experi')]
TMM19020501-V14-05-page42.txt: [('-us', 'us')]
TMM19020501-V14-05-page43.txt: [('LATER.-', 'LATER.')]
TMM19020501-V14-05-page45.txt: [('ex-', 'ex')]
TMM19020501-V14-05-page46.txt: [('-', ''), ('Con-', 'Con')]
TMM19020501-V14-05-page47.txt: [('-', ''), ('Relief.-', 'Relief.'), ('Sanatorium.-', 'Sanatorium.'), ('-', ''), ('Tithe.-', 'Tithe.'), ('Conference.-', 'Conference.'), ('Mission.-', 'Mission.'), ('Conference.-', 'Conference.'), ('-', ''), ('-', ''), ('.-', '.'), ('-', ''), ('Mission.-', 'Mission.'), ('Italy.-', 'Italy.'), ('-', ''), ('-', ''), ('Mission.-', 'Mission.'), ('-', ''), ('Mission.-', 'Mission.'), ('-', ''), ('-California', 'California'), ('-', ''), ('Conference.-', 'Conference.'), ('Conference.-', 'Conference.'), ('-', ''), ('-Iowa', 'Iowa'), ('-', ''), ('-Minnesota', 'Minnesota')]
TMM19020501-V14-05-page48.txt: [('-', ''), ('-', ''), ('-', ''), ('Fund.-', 'Fund.'), ('-', ''), ('-', ''), ('BENEV-', 'BENEV'), ('-', '')]
TMM19020501-V14-05-page5.txt: [('I-', 'I')]
TMM19020501-V14-05-page50.txt: [('-', ''), ('-', ''), ('-', ''), ('-', '')]
TMM19020501-V14-05-page51.txt: [('or.-', 'or.'), ('-wl', 'wl'), ('--', '-')]
TMM19020501-V14-05-page52.txt: [('-', ''), ('-', '')]
TMM19020501-V14-05-page6.txt: [('key-', 'key')]
TMM19020501-V14-05-page7.txt: [('MAGA-', 'MAGA')]
TMM19020501-V14-05-page8.txt: [('pro-', 'pro')]
In [30]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/TMM/correction3

Average verified rate: 0.9839672985814993

Average of error rates: 0.023232207792207794

Total token count: 861614

In [31]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[31]:
[("'", 583),
 ('e', 484),
 ('w', 476),
 ('m', 341),
 ('t', 326),
 ('r', 309),
 ('d', 302),
 ('n', 298),
 ('f', 269),
 ('g', 250),
 ('th', 109),
 ('x', 75),
 ('co', 70),
 ('k', 66),
 ('pa', 64),
 ('u', 64),
 ('z', 61),
 ('mis', 42),
 ('io', 42),
 ('oc', 40),
 ('oo', 33),
 ('cc', 29),
 ('sionary', 29),
 ('re', 25),
 ('al', 23),
 ("'the", 23),
 ('q', 22),
 ('mt', 20),
 ('hausaland', 19),
 ('id', 19),
 ("''", 19),
 ('stauffer', 19),
 ('ary', 19),
 ('basle', 18),
 ('zo', 18),
 ('ft', 18),
 ('mo', 18),
 ('couva', 17),
 ('kalaka', 17),
 ('hasegawa', 17),
 ('sul', 17),
 ('okohira', 16),
 ('ro', 16),
 ('pp', 15),
 ('helsingfors', 15),
 ('sabbathschool', 15),
 ("hours'", 15),
 ('te', 15),
 ('schwantes', 15),
 ('raiatea', 15)]

Correction 4 -- Remove Extra Quotation Marks

In [33]:
# %load shared_elements/remove_extra_quotation_marks.py
prev = cycle
cycle = "correction4"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)
    
    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    
    corrections = []
    for token in tokens:
        token_list = list(token)
        last_char = token_list[-1]

        if last_char is "'":
            if len(token) > 1:
                if token_list[-2] is 's' or 'S':
                    pass
                else:
                    corrections.append((token, re.sub(r"'", r"", token)))
            else:
                pass
        elif token[0] is "'":
            corrections.append((token, re.sub(r"'", r"", token)))   
        else:
            pass
    
    if len(corrections) > 0:
        print('{}: {}'.format(filename, corrections))

        for correction in corrections:
            content = clean.replace_pair(correction, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
TMM18980101-V10-01-page11.txt: [("'tis", 'tis')]
TMM18980101-V10-01-page12.txt: [("'his", 'his')]
TMM18980101-V10-01-page13.txt: [("'nough", 'nough')]
TMM18980101-V10-01-page16.txt: [("'The", 'The')]
TMM18980101-V10-01-page3.txt: [("'Redeemer", 'Redeemer')]
TMM18980101-V10-01-page4.txt: [("'YWVP", 'YWVP')]
TMM18980101-V10-01-page5.txt: [("'the", 'the'), ("'which", 'which'), ("'sphere", 'sphere')]
TMM18980201-V10-02-page14.txt: [("'my", 'my')]
TMM18980201-V10-02-page34.txt: [("'Contemplated", 'Contemplated')]
TMM18980301-V10-03-page6.txt: [("'t", 't')]
TMM18980401-V10-04-page12.txt: [("'put", 'put')]
TMM18980401-V10-04-page13.txt: [("'a", 'a')]
TMM18980401-V10-04-page17.txt: [("'The", 'The')]
TMM18980401-V10-04-page22.txt: [("'one", 'one'), ("'of", 'of')]
TMM18980401-V10-04-page31.txt: [("'s", 's'), ("'s", 's'), ("'s", 's'), ("'out", 'out')]
TMM18980401-V10-04-page4.txt: [("'S", 'S')]
TMM18980501-V10-05-page14.txt: [("'appreciate", 'appreciate')]
TMM18980501-V10-05-page28.txt: [("'g", 'g'), ("'f", 'f')]
TMM18980501-V10-05-page31.txt: [("'HYATT.", 'HYATT.')]
TMM18980501-V10-05-page40.txt: [("'God", 'God')]
TMM18980501-V10-05-page7.txt: [("'tis", 'tis')]
TMM18980601-V10-06-page20.txt: [("'.", '.'), ("'I", 'I'), ("'t", 't'), ("'..", '..')]
TMM18980601-V10-06-page25.txt: [("'Ye", 'Ye')]
TMM18980601-V10-06-page26.txt: [("'Here", 'Here')]
TMM18980601-V10-06-page32.txt: [("'countries.", 'countries.')]
TMM18980601-V10-06-page36.txt: [("'business", 'business')]
TMM18980601-V10-06-page5.txt: [("'forward", 'forward')]
TMM18980701-V10-07-page23.txt: [("'some", 'some')]
TMM18980701-V10-07-page25.txt: [("'the", 'the')]
TMM18980701-V10-07-page38.txt: [("'.", '.')]
TMM18980701-V10-07-page39.txt: [("'Signs", 'Signs')]
TMM18980701-V10-07-page40.txt: [("'Signs", 'Signs')]
TMM18980701-V10-07-page7.txt: [("'AlI", 'AlI')]
TMM18980801-V10-08-page11.txt: [("'them", 'them')]
TMM18980801-V10-08-page30.txt: [("'Come", 'Come')]
TMM18980801-V10-08-page31.txt: [("'is", 'is')]
TMM18980901-V10-09-page35.txt: [("'M.", 'M.')]
TMM18980901-V10-09-page7.txt: [("'Praise", 'Praise'), ("'Make", 'Make'), ("'I", 'I')]
TMM18981001-V10-10-page11.txt: [("'Reformation", 'Reformation'), ("'of", 'of')]
TMM18981001-V10-10-page13.txt: [("'at", 'at')]
TMM18981001-V10-10-page15.txt: [("'villas", 'villas')]
TMM18981001-V10-10-page21.txt: [("'Salvation", 'Salvation')]
TMM18981001-V10-10-page37.txt: [("'il", 'il'), ("'A", 'A')]
TMM18981001-V10-10-page6.txt: [("'eternal", 'eternal')]
TMM18981101-V10-11-page15.txt: [("'divine", 'divine')]
TMM18981101-V10-11-page17.txt: [("'Great", 'Great')]
TMM18981101-V10-11-page18.txt: [("'foreigner", 'foreigner'), ("'Corn", 'Corn'), ("'Great", 'Great')]
TMM18981101-V10-11-page22.txt: [("'native", 'native')]
TMM18981101-V10-11-page26.txt: [("'new", 'new')]
TMM18981101-V10-11-page32.txt: [('\'Creature."', 'Creature."')]
TMM18981101-V10-11-page37.txt: [("'.", '.')]
TMM18981101-V10-11-page5.txt: [("'authority", 'authority')]
TMM18981101-V10-11-page9.txt: [("'a", 'a')]
TMM18981201-V10-12-page10.txt: [("'no", 'no')]
TMM18981201-V10-12-page11.txt: [("'not", 'not')]
TMM18981201-V10-12-page31.txt: [("'he", 'he')]
TMM18981201-V10-12-page4.txt: [("'the", 'the')]
TMM18981201-V10-12-page41.txt: [("'tis", 'tis'), ("'twill", 'twill')]
TMM18981201-V10-12-page43.txt: [("'liberty", 'liberty')]
TMM18981201-V10-12-page7.txt: [("'Nile", 'Nile')]
TMM18990101-V11-01-page14.txt: [("'peons", 'peons'), ("'The'gold", 'Thegold'), ("'trees", 'trees')]
TMM18990101-V11-01-page20.txt: [("'our", 'our')]
TMM18990101-V11-01-page34.txt: [("'southeastern", 'southeastern')]
TMM18990101-V11-01-page39.txt: [("'If", 'If')]
TMM18990101-V11-01-page41.txt: [("'why", 'why')]
TMM18990101-V11-01-page47.txt: [("'Bible", 'Bible')]
TMM18990201-V11-02-page11.txt: [("'Battle", 'Battle')]
TMM18990201-V11-02-page31.txt: [("'Gather", 'Gather')]
TMM18990201-V11-02-page49.txt: [("'whom", 'whom')]
TMM18990201-V11-02-page51.txt: [("'or", 'or')]
TMM18990201-V11-02-page55.txt: [("'i", 'i'), ("'I", 'I')]
TMM18990301-V11-03-page14.txt: [("'the", 'the')]
TMM18990301-V11-03-page16.txt: [("'best", 'best')]
TMM18990301-V11-03-page26.txt: [("'He", 'He'), ("'is", 'is')]
TMM18990301-V11-03-page28.txt: [("'to", 'to')]
TMM18990301-V11-03-page37.txt: [("'acquainted", 'acquainted')]
TMM18990401-V11-04-page1.txt: [("'VOL.", 'VOL.')]
TMM18990401-V11-04-page3.txt: [("'hands", 'hands'), ("'Of", 'Of'), ("'Mercy", 'Mercy'), ("'drawn", 'drawn'), ("'culminate", 'culminate')]
TMM18990401-V11-04-page7.txt: [("'with", 'with')]
TMM18990501-V11-05-page35.txt: [("'cause", 'cause')]
TMM18990501-V11-05-page47.txt: [("'i", 'i')]
TMM18990601-V11-06-page10.txt: [("'very", 'very')]
TMM18990601-V11-06-page11.txt: [("'one", 'one'), ("'each", 'each')]
TMM18990601-V11-06-page12.txt: [("'produce", 'produce')]
TMM18990601-V11-06-page24.txt: [("'the", 'the')]
TMM18990601-V11-06-page29.txt: [("'northwest", 'northwest')]
TMM18990601-V11-06-page31.txt: [("'another", 'another')]
TMM18990701-V11-07-page11.txt: [("'Central", 'Central')]
TMM18990701-V11-07-page17.txt: [("'These", 'These')]
TMM18990701-V11-07-page2.txt: [("'The", 'The'), ("'God", 'God')]
TMM18990701-V11-07-page25.txt: [("'Church", 'Church')]
TMM18990701-V11-07-page33.txt: [("'Tis", 'Tis')]
TMM18990701-V11-07-page37.txt: [("'baptism", 'baptism')]
TMM18990701-V11-07-page39.txt: [("'Surely", 'Surely'), ("'Why", 'Why')]
TMM18990701-V11-07-page4.txt: [("'China", 'China'), ("'will", 'will')]
TMM18990701-V11-07-page40.txt: [("'Germany", 'Germany')]
TMM18990701-V11-07-page42.txt: [("'We", 'We')]
TMM18990701-V11-07-page46.txt: [("'row", 'row')]
TMM18990701-V11-07-page47.txt: [("''ettntIV", 'ettntIV')]
TMM18990801-V11-08-page11.txt: [("'R.", 'R.'), ("'.", '.'), ("'ft", 'ft'), ("'.", '.'), ("'.", '.'), ("'.", '.'), ("'C", 'C'), ("'RIM", 'RIM'), ("'iOnNA", 'iOnNA'), ("'Ct.", 'Ct.'), ("'cc", 'cc'), ("'..", '..'), ("'L.", 'L.'), ("'W", 'W'), ("'Mg", 'Mg'), ("'t", 't'), ("'.", '.'), ("'Co", 'Co'), ("'lgl", 'lgl'), ("'rt", 'rt'), ("'cCc", 'cCc')]
TMM18990801-V11-08-page37.txt: [("'i'Selected", 'iSelected')]
TMM18990901-V11-09-page1.txt: [("'ve", 've')]
TMM18990901-V11-09-page12.txt: [("'that", 'that')]
TMM18990901-V11-09-page25.txt: [("'Well", 'Well')]
TMM18990901-V11-09-page42.txt: [("'The", 'The')]
TMM18990901-V11-09-page43.txt: [("'Send", 'Send'), ("'comprehend", 'comprehend'), ("'Neglected", 'Neglected'), ("'Here", 'Here')]
TMM18991001-V11-10-page11.txt: [("'and", 'and'), ("'more", 'more')]
TMM18991001-V11-10-page22.txt: [("'to", 'to')]
TMM18991001-V11-10-page27.txt: [("'mission", 'mission')]
TMM18991001-V11-10-page30.txt: [("'licensed", 'licensed')]
TMM18991001-V11-10-page4.txt: [("'to", 'to'), ("'selected", 'selected')]
TMM18991001-V11-10-page41.txt: [("'giving", 'giving')]
TMM18991001-V11-10-page42.txt: [("'twelve", 'twelve')]
TMM18991001-V11-10-page43.txt: [("'How", 'How')]
TMM18991001-V11-10-page45.txt: [("'Stich", 'Stich')]
TMM18991101-V11-11-page23.txt: [("'...", '...'), ("'Z", 'Z')]
TMM18991101-V11-11-page24.txt: [("'Many", 'Many'), ("'going", 'going')]
TMM18991101-V11-11-page7.txt: [("'is", 'is')]
TMM18991201-V11-12-page24.txt: [("'filled", 'filled')]
TMM18991201-V11-12-page28.txt: [("'presidents", 'presidents')]
TMM18991201-V11-12-page31.txt: [("'Whosoever", 'Whosoever')]
TMM18991201-V11-12-page40.txt: [("'summer's", 'summers')]
TMM18991201-V11-12-page45.txt: [("'benefited", 'benefited')]
TMM19000101-V12-01-page1.txt: [("'GREECE", 'GREECE')]
TMM19000101-V12-01-page10.txt: [("'qie", 'qie'), ("'o", 'o')]
TMM19000101-V12-01-page29.txt: [("'blue", 'blue')]
TMM19000101-V12-01-page31.txt: [("'climbing", 'climbing')]
TMM19000101-V12-01-page32.txt: [("'twere", 'twere')]
TMM19000101-V12-01-page34.txt: [("'We", 'We')]
TMM19000101-V12-01-page35.txt: [("'I", 'I')]
TMM19000101-V12-01-page46.txt: [("'make", 'make')]
TMM19000101-V12-01-page48.txt: [("'Thou", 'Thou'), ("'No", 'No')]
TMM19000101-V12-01-page52.txt: [("'BILLS", 'BILLS'), ("'TRANSFER", 'TRANSFER')]
TMM19000201-V12-02-page1.txt: [("'THE", 'THE')]
TMM19000201-V12-02-page30.txt: [("'Shall", 'Shall')]
TMM19000201-V12-02-page39.txt: [("'the", 'the')]
TMM19000201-V12-02-page5.txt: [('\'"and', '"and')]
TMM19000201-V12-02-page51.txt: [("'Milk", 'Milk'), ("'COTTItEri", 'COTTItEri'), ("'BrOOkbtli", 'BrOOkbtli'), ("'clean", 'clean')]
TMM19000301-V12-03-page17.txt: [("'Go", 'Go')]
TMM19000301-V12-03-page2.txt: [("'God", 'God')]
TMM19000301-V12-03-page31.txt: [("'Well", 'Well')]
TMM19000301-V12-03-page34.txt: [("'not", 'not'), ("'powers", 'powers')]
TMM19000301-V12-03-page36.txt: [("'Here", 'Here')]
TMM19000301-V12-03-page47.txt: [("'upon", 'upon'), ("'Come", 'Come')]
TMM19000301-V12-03-page48.txt: [("'well", 'well')]
TMM19000301-V12-03-page5.txt: [("'the", 'the')]
TMM19000301-V12-03-page9.txt: [("''j", 'j')]
TMM19000401-V12-04-page15.txt: [("'I", 'I')]
TMM19000401-V12-04-page16.txt: [("'centuries", 'centuries')]
TMM19000401-V12-04-page29.txt: [("'suppose", 'suppose')]
TMM19000401-V12-04-page33.txt: [("'Christ", 'Christ')]
TMM19000401-V12-04-page51.txt: [("'s", 's'), ("'our", 'our'), ("'Come", 'Come')]
TMM19000401-V12-04-page52.txt: [("'Tis", 'Tis')]
TMM19000501-V12-05-page1.txt: [("'CIRCLE", 'CIRCLE')]
TMM19000501-V12-05-page2.txt: [("'lasso", 'lasso')]
TMM19000501-V12-05-page32.txt: [("'Lord", 'Lord')]
TMM19000501-V12-05-page39.txt: [("'time", 'time')]
TMM19000501-V12-05-page42.txt: [("'Years", 'Years')]
TMM19000501-V12-05-page51.txt: [("'lath", 'lath'), ("'that", 'that'), ("'si", 'si'), ("'The", 'The'), ("'because", 'because')]
TMM19000501-V12-05-page52.txt: [("'Tis", 'Tis')]
TMM19000501-V12-05-page6.txt: [("'Tis", 'Tis')]
TMM19000601-V12-06-page1.txt: [("'A", 'A'), ("'Nassau", 'Nassau')]
TMM19000601-V12-06-page11.txt: [("'Come", 'Come')]
TMM19000601-V12-06-page14.txt: [("'little", 'little')]
TMM19000601-V12-06-page52.txt: [("'Tis", 'Tis')]
TMM19000701-V12-07-page10.txt: [("'The", 'The')]
TMM19000701-V12-07-page12.txt: [("'If", 'If'), ("'Ever", 'Ever')]
TMM19000701-V12-07-page13.txt: [("'of", 'of')]
TMM19000701-V12-07-page46.txt: [("'Well", 'Well')]
TMM19000701-V12-07-page5.txt: [("'twas", 'twas')]
TMM19000701-V12-07-page52.txt: [("'Tis", 'Tis')]
TMM19000701-V12-07-page7.txt: [("'ping", 'ping')]
TMM19000801-V12-08-page11.txt: [("'to", 'to')]
TMM19000801-V12-08-page13.txt: [("'Mohammedan", 'Mohammedan')]
TMM19000801-V12-08-page2.txt: [("'mw", 'mw')]
TMM19000801-V12-08-page22.txt: [("'The", 'The')]
TMM19000801-V12-08-page23.txt: [("'Sufficient", 'Sufficient')]
TMM19000801-V12-08-page24.txt: [("'Have", 'Have')]
TMM19000801-V12-08-page29.txt: [("'luminated", 'luminated')]
TMM19000801-V12-08-page36.txt: [("'In", 'In')]
TMM19000801-V12-08-page41.txt: [("'Casting", 'Casting'), ("'Cast", 'Cast'), ("'Hast", 'Hast'), ("'Commit", 'Commit'), ("'are", 'are')]
TMM19000801-V12-08-page42.txt: [("'Underneath", 'Underneath')]
TMM19000801-V12-08-page47.txt: [("'United", 'United')]
TMM19000801-V12-08-page52.txt: [("'Tis", 'Tis'), ("'enA", 'enA')]
TMM19000801-V12-08-page6.txt: [("'Behold", 'Behold')]
TMM19000901-V12-09-page12.txt: [("'For", 'For'), ("'The", 'The'), ("'for", 'for'), ("'For", 'For'), ("'The", 'The'), ("'For", 'For'), ("'The", 'The'), ("'Though", 'Though')]
TMM19000901-V12-09-page16.txt: [("'hall", 'hall')]
TMM19000901-V12-09-page43.txt: [("'These", 'These')]
TMM19000901-V12-09-page45.txt: [("'T", 'T')]
TMM19000901-V12-09-page49.txt: [("'is", 'is')]
TMM19000901-V12-09-page52.txt: [("'for", 'for'), ("'Tis", 'Tis')]
TMM19000901-V12-09-page6.txt: [("'send", 'send'), ("'missionary", 'missionary')]
TMM19000901-V12-09-page7.txt: [("'missionary", 'missionary')]
TMM19001001-V12-10-page1.txt: [("'PERFECT", 'PERFECT'), ("'CHINA", 'CHINA'), ("'INDEPENDENCE", 'INDEPENDENCE'), ("'I.", 'I.'), ("'LETTERS", 'LETTERS'), ("'PUBLISHED", 'PUBLISHED')]
TMM19001001-V12-10-page16.txt: [("'When", 'When')]
TMM19001001-V12-10-page18.txt: [("'Pearly", 'Pearly')]
TMM19001001-V12-10-page19.txt: [("'way", 'way'), ("'Old", 'Old'), ("'All", 'All'), ("'Reason", 'Reason')]
TMM19001001-V12-10-page49.txt: [("'The", 'The'), ("'are", 'are'), ("'knows", 'knows')]
TMM19001001-V12-10-page51.txt: [("'York.", 'York.')]
TMM19001001-V12-10-page52.txt: [("'Tie", 'Tie'), ("'Latest", 'Latest')]
TMM19001101-V12-11-page16.txt: [("'the", 'the')]
TMM19001101-V12-11-page18.txt: [("'professional", 'professional')]
TMM19001101-V12-11-page47.txt: [("'Why", 'Why')]
TMM19001101-V12-11-page51.txt: [("'roof", 'roof'), ("'Brooklyn", 'Brooklyn'), ("'without", 'without'), ("'for", 'for'), ("'under", 'under'), ("'Venial..", 'Venial..')]
TMM19001101-V12-11-page52.txt: [("'Seventh", 'Seventh'), ("'fine", 'fine'), ("'not", 'not')]
TMM19001101-V12-11-page6.txt: [("'he", 'he'), ("'Cast", 'Cast'), ("'Cast", 'Cast')]
TMM19001201-V12-12-page14.txt: [("'la", 'la')]
TMM19001201-V12-12-page2.txt: [("'UNDERWOOD", 'UNDERWOOD')]
TMM19001201-V12-12-page22.txt: [("'house", 'house')]
TMM19001201-V12-12-page28.txt: [("'events.", 'events.')]
TMM19001201-V12-12-page3.txt: [("'i", 'i')]
TMM19001201-V12-12-page49.txt: [("'Volunteer", 'Volunteer')]
TMM19001201-V12-12-page51.txt: [("'York.", 'York.')]
TMM19001201-V12-12-page52.txt: [("'Tie", 'Tie')]
TMM19001201-V12-12-page9.txt: [("'as", 'as')]
TMM19020101-V14-01-page10.txt: [("'onechapter", 'onechapter')]
TMM19020101-V14-01-page13.txt: [("'neath", 'neath'), ("'Tis", 'Tis'), ("'Tis", 'Tis')]
TMM19020101-V14-01-page2.txt: [("'N", 'N')]
TMM19020101-V14-01-page34.txt: [("'disease.", 'disease.')]
TMM19020201-V14-02-page37.txt: [('\'"', '"')]
TMM19020201-V14-02-page40.txt: [("'verse", 'verse')]
TMM19020201-V14-02-page42.txt: [("'at", 'at')]
TMM19020201-V14-02-page49.txt: [("'Jo", 'Jo')]
TMM19020201-V14-02-page50.txt: [("'act", 'act'), ("'For", 'For')]
TMM19020201-V14-02-page51.txt: [("'stigmatized", 'stigmatized')]
TMM19020201-V14-02-page52.txt: [("'Scenery", 'Scenery')]
TMM19020301-V14-03-page1.txt: [("'The", 'The'), ("'illustrated", 'illustrated'), ("'Japan", 'Japan'), ("'Among", 'Among'), ("'.", '.'), ("'MISSION", 'MISSION'), ("'.", '.'), ("'The", 'The'), ("'.", '.'), ("'Children", 'Children')]
TMM19020301-V14-03-page10.txt: [("'will", 'will')]
TMM19020301-V14-03-page2.txt: [("'ckroutes", 'ckroutes'), ("'.", '.'), ("'it", 'it'), ("'ST", 'ST'), ("'CIMAGO", 'CIMAGO'), ("'State", 'State')]
TMM19020301-V14-03-page25.txt: [("'quarantine", 'quarantine')]
TMM19020301-V14-03-page26.txt: [("'distant.", 'distant.')]
TMM19020301-V14-03-page40.txt: [("'has", 'has')]
TMM19020301-V14-03-page42.txt: [("'diet", 'diet')]
TMM19020301-V14-03-page44.txt: [("'and", 'and')]
TMM19020301-V14-03-page8.txt: [("'patient.", 'patient.')]
TMM19020401-V14-04-page12.txt: [("'effigy", 'effigy')]
TMM19020401-V14-04-page15.txt: [("'Tis", 'Tis')]
TMM19020401-V14-04-page19.txt: [("'the", 'the')]
TMM19020401-V14-04-page39.txt: [("'Therefore", 'Therefore')]
TMM19020401-V14-04-page46.txt: [("'down", 'down')]
TMM19020401-V14-04-page51.txt: [("'Tremont", 'Tremont')]
TMM19020401-V14-04-page52.txt: [("'OW", 'OW')]
TMM19020501-V14-05-page14.txt: [("'had", 'had')]
TMM19020501-V14-05-page2.txt: [("'CABINET", 'CABINET')]
TMM19020501-V14-05-page24.txt: [("'from", 'from')]
TMM19020501-V14-05-page31.txt: [("'But", 'But')]
TMM19020501-V14-05-page32.txt: [("'only", 'only')]
TMM19020501-V14-05-page37.txt: [("'come", 'come')]
TMM19020501-V14-05-page51.txt: [("'.", '.'), ("'....", '....'), ("'details", 'details')]
TMM19020501-V14-05-page6.txt: [("'love.", 'love.')]
In [36]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/TMM/correction4

Average verified rate: 0.9845119517865869

Average of error rates: 0.022646753246753245

Total token count: 861503

In [37]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[37]:
[('e', 485),
 ('w', 476),
 ("'", 466),
 ('t', 344),
 ('m', 343),
 ('r', 313),
 ('d', 303),
 ('n', 302),
 ('f', 271),
 ('g', 254),
 ('th', 110),
 ('x', 76),
 ('co', 70),
 ('k', 68),
 ('pa', 64),
 ('u', 64),
 ('z', 63),
 ('mis', 42),
 ('io', 42),
 ('oc', 40),
 ('oo', 33),
 ('cc', 31),
 ('sionary', 29),
 ('re', 25),
 ('al', 23),
 ('q', 22),
 ('mt', 20),
 ('hausaland', 19),
 ('id', 19),
 ('ft', 19),
 ('stauffer', 19),
 ('ary', 19),
 ('basle', 18),
 ('zo', 18),
 ('mo', 18),
 ('couva', 17),
 ('kalaka', 17),
 ('hasegawa', 17),
 ('sul', 17),
 ('okohira', 16),
 ('ro', 16),
 ('pp', 15),
 ('helsingfors', 15),
 ('sabbathschool', 15),
 ("hours'", 15),
 ('te', 15),
 ('schwantes', 15),
 ('raiatea', 15),
 ('wm', 15),
 ('ioo', 14)]

Correction 5 -- Rejoin Split Words

In [39]:
# %load shared_elements/rejoin_split_words.py
prev = cycle
cycle = "correction5"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)
    
    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    errors = reports.identify_errors(tokens, spelling_dictionary)

    replacements = clean.check_if_stem(errors, spelling_dictionary, tokens, get_prior=False)
    
    if len(replacements) > 0:
        print('{}: {}'.format(filename, replacements))

        for replacement in replacements:
            content = clean.replace_split_words(replacement, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
TMM18980101-V10-01-page1.txt: [('Mis', 'SIONARY')]
TMM18980101-V10-01-page31.txt: [('ro', 'of')]
TMM18980101-V10-01-page4.txt: [('fil', 'A')]
TMM18980201-V10-02-page37.txt: [('MIS', 'SION'), ('QUAR', 'TER')]
TMM18980201-V10-02-page38.txt: [('zo', 'o')]
TMM18980201-V10-02-page39.txt: [('SECRE', 'TARIES')]
TMM18980201-V10-02-page9.txt: [('Ning', 'po')]
TMM18980301-V10-03-page24.txt: [('Mis', 'SIONARY')]
TMM18980301-V10-03-page25.txt: [('es', 'd'), ('mi', 'o'), ('por', 'no'), ('Aqui', 'no')]
TMM18980301-V10-03-page39.txt: [('Mis', 'SION')]
TMM18980301-V10-03-page6.txt: [('Fi', 'le')]
TMM18980301-V10-03-page8.txt: [('Mc', 'Carthy')]
TMM18980401-V10-04-page26.txt: [('HISTOR', 'ICAL')]
TMM18980401-V10-04-page3.txt: [('G.', '')]
TMM18980401-V10-04-page36.txt: [('pais', 'a')]
TMM18980401-V10-04-page7.txt: [("KING'", 'S')]
TMM18980501-V10-05-page33.txt: [("God'", 's')]
TMM18980501-V10-05-page35.txt: [('MIS', 'SION'), ('QUAR', 'TER')]
TMM18980501-V10-05-page39.txt: [('re', 'leased')]
TMM18980601-V10-06-page16.txt: [('sr', 'A')]
TMM18980801-V10-08-page35.txt: [('MIS', 'SION'), ('QUAR', 'TER')]
TMM18980801-V10-08-page38.txt: [('MIS', 'SIONARY')]
TMM18980901-V10-09-page17.txt: [('wh', 'en')]
TMM18981001-V10-10-page28.txt: [('Previo', 'us')]
TMM18981101-V10-11-page17.txt: [('MIS', 'SIONARY')]
TMM18981101-V10-11-page20.txt: [('Tien', 'Tsin')]
TMM18981101-V10-11-page25.txt: [('MAGA', 'ZINE')]
TMM18981101-V10-11-page36.txt: [('SOCI', 'ETY')]
TMM18981101-V10-11-page37.txt: [('MAGA', 'ZINE')]
TMM18981101-V10-11-page38.txt: [('MAGA', 'ZINE')]
TMM18981201-V10-12-page41.txt: [('wa', 'y'), ('MC', 'CARTHY')]
TMM18990101-V11-01-page14.txt: [('es', 'Pecially')]
TMM18990101-V11-01-page29.txt: [('MIS', 'SIONARY')]
TMM18990101-V11-01-page47.txt: [('PHILADEL', 'PHIA')]
TMM18990101-V11-01-page48.txt: [('repre', 'sentative')]
TMM18990201-V11-02-page14.txt: [("Angel'", 's')]
TMM18990201-V11-02-page52.txt: [('MIS', 'SION'), ('QUAR', 'TER')]
TMM18990201-V11-02-page54.txt: [('PHILADEL', 'PHIA')]
TMM18990301-V11-03-page11.txt: [('tri', 'weekly')]
TMM18990301-V11-03-page34.txt: [('G.', '')]
TMM18990301-V11-03-page38.txt: [('PHILADEL', 'PHIA')]
TMM18990401-V11-04-page36.txt: [('th', 'or')]
TMM18990401-V11-04-page38.txt: [('PHILADEL', 'PHIA'), ('Indo', 'China')]
TMM18990401-V11-04-page39.txt: [('repre', 'sentative')]
TMM18990501-V11-05-page42.txt: [('MIS', 'SION'), ('QUAR', 'TER')]
TMM18990501-V11-05-page46.txt: [('PHILADEL', 'PHIA')]
TMM18990601-V11-06-page46.txt: [('PHILADEL', 'PHIA')]
TMM18990701-V11-07-page26.txt: [('civiliz', 'ation')]
TMM18990701-V11-07-page27.txt: [('so-', 'called')]
TMM18990701-V11-07-page41.txt: [('longsuffer', 'ing')]
TMM18990701-V11-07-page46.txt: [('MIS', 'SIONARY')]
TMM18990801-V11-08-page11.txt: [('PC', 't'), ('al', 'i'), ('CA', 'W'), ('RI', 'a'), ('TE', 'R'), ('ato', 'N'), ('re', 'C'), ('ma', 'I'), ('te', 'a'), ('tAl', 'a'), ('JU', 'N'), ('Ele', 'a'), ('EV', 'I'), ('CI', 'T')]
TMM18990801-V11-08-page22.txt: [('IL', 'A')]
TMM18990801-V11-08-page34.txt: [('Philadel', 'phia')]
TMM18990801-V11-08-page45.txt: [('MIS', 'SION')]
TMM18990801-V11-08-page46.txt: [('MIS', 'SIONARY')]
TMM18990901-V11-09-page34.txt: [('co', 'laborer')]
TMM18990901-V11-09-page35.txt: [('mis', 'sionary')]
TMM18990901-V11-09-page46.txt: [('MIS', 'SIONARY')]
TMM18991001-V11-10-page17.txt: [('Mc', 'Carthy')]
TMM18991001-V11-10-page46.txt: [('MIS', 'SIONARY')]
TMM18991101-V11-11-page37.txt: [('MAGA', 'ZINE')]
TMM18991101-V11-11-page42.txt: [('MIS', 'SION')]
TMM18991101-V11-11-page44.txt: [('Indo', 'China')]
TMM18991101-V11-11-page46.txt: [('MIS', 'SIONARY'), ('Superin', 'tendent')]
TMM18991201-V11-12-page21.txt: [('MC', 'CARTHY')]
TMM18991201-V11-12-page31.txt: [('th', 'at')]
TMM18991201-V11-12-page38.txt: [('RE', 'V')]
TMM18991201-V11-12-page40.txt: [('Nebuchadn', "ezzar's")]
TMM18991201-V11-12-page46.txt: [('MIS', 'SIONARY')]
TMM19000101-V12-01-page10.txt: [('re', 'a')]
TMM19000101-V12-01-page37.txt: [('MAGA', 'ZINE')]
TMM19000101-V12-01-page44.txt: [('MIS', 'SIONARY')]
TMM19000101-V12-01-page47.txt: [('MAGA', 'ZINE')]
TMM19000101-V12-01-page50.txt: [('MIS', 'SIONARY'), ('MAGA', 'ZINE')]
TMM19000101-V12-01-page51.txt: [('reci', 'pes')]
TMM19000101-V12-01-page6.txt: [('un', 'INTELLIGENT')]
TMM19000201-V12-02-page36.txt: [('corre', 'sponding')]
TMM19000201-V12-02-page40.txt: [('Indo', 'China'), ('EM', 'It')]
TMM19000201-V12-02-page46.txt: [('MIS', 'SION')]
TMM19000201-V12-02-page50.txt: [('MIS', 'SIONARY')]
TMM19000201-V12-02-page51.txt: [('TRE', 'S')]
TMM19000201-V12-02-page52.txt: [('ig', 'n')]
TMM19000301-V12-03-page39.txt: [('pre', 'arrangement')]
TMM19000301-V12-03-page46.txt: [('ti', 'to')]
TMM19000401-V12-04-page50.txt: [('MIS', 'SIONARY')]
TMM19000401-V12-04-page51.txt: [('WA', 'RDS')]
TMM19000501-V12-05-page18.txt: [('CA', 'VINESS')]
TMM19000501-V12-05-page22.txt: [('re', 'no')]
TMM19000501-V12-05-page45.txt: [('MIS', 'SION')]
TMM19000501-V12-05-page50.txt: [('MIS', 'SIONARY')]
TMM19000501-V12-05-page52.txt: [('Li', 'Q')]
TMM19000601-V12-06-page21.txt: [('Ju', 'n')]
TMM19000601-V12-06-page45.txt: [('RE', 'VIEW')]
TMM19000601-V12-06-page49.txt: [('infor', 'mation')]
TMM19000601-V12-06-page50.txt: [('MIS', 'SIONARY')]
TMM19000701-V12-07-page40.txt: [('exac', 'test')]
TMM19000701-V12-07-page50.txt: [('MIS', 'SIONARY'), ('regula', 'rly')]
TMM19000701-V12-07-page52.txt: [('SIMPLICIT', 'Y')]
TMM19000801-V12-08-page12.txt: [('wh', 'o')]
TMM19000801-V12-08-page44.txt: [('MIS', 'SION')]
TMM19000801-V12-08-page45.txt: [('MA', 'TABELE')]
TMM19000801-V12-08-page47.txt: [('Tien', 'Tsin')]
TMM19000801-V12-08-page50.txt: [('MIS', 'SIONARY')]
TMM19000801-V12-08-page51.txt: [('VESTIBU', 'LED')]
TMM19000801-V12-08-page52.txt: [('SIMPLIC', 'ITY')]
TMM19000901-V12-09-page32.txt: [('MC', 'CARTHY')]
TMM19000901-V12-09-page5.txt: [('MC', 'CARTHY')]
TMM19000901-V12-09-page50.txt: [('MIS', 'SIONARY')]
TMM19001001-V12-10-page20.txt: [('MC', 'CARTHY')]
TMM19001001-V12-10-page38.txt: [('studen', 'ts')]
TMM19001001-V12-10-page44.txt: [('re', 'ct')]
TMM19001001-V12-10-page50.txt: [('MIS', 'SIONARY')]
TMM19001101-V12-11-page28.txt: [('MC', 'CARTHY')]
TMM19001101-V12-11-page44.txt: [('MIS', 'SION')]
TMM19001101-V12-11-page45.txt: [('RARATONG', 'A')]
TMM19001101-V12-11-page47.txt: [('fel', 'lows')]
TMM19001101-V12-11-page50.txt: [('MIS', 'SIONARY')]
TMM19001101-V12-11-page51.txt: [('BEW', 'ARE'), ('re', 'Price')]
TMM19001201-V12-12-page11.txt: [('magnif', 'icent')]
TMM19001201-V12-12-page18.txt: [('MC', 'CARTHY')]
TMM19001201-V12-12-page2.txt: [('co', 'mpany')]
TMM19001201-V12-12-page31.txt: [('wa', 's')]
TMM19001201-V12-12-page35.txt: [('re', 'reading')]
TMM19001201-V12-12-page47.txt: [('Ro', 'man')]
TMM19001201-V12-12-page50.txt: [('MIS', 'SIONARY')]
TMM19020101-V14-01-page43.txt: [('Guadalaj', 'ara')]
TMM19020201-V14-02-page17.txt: [('unf', 'allen')]
TMM19020201-V14-02-page33.txt: [('th', 'or')]
TMM19020201-V14-02-page48.txt: [('ASSO', 'CIATION')]
TMM19020201-V14-02-page49.txt: [('LOUISI', 'ANA'), ('NE', 'BRASKA')]
TMM19020201-V14-02-page8.txt: [('Tien', 'Tsin')]
TMM19020301-V14-03-page1.txt: [('EDITORI', 'AL')]
TMM19020301-V14-03-page16.txt: [('MC', 'CARTHY')]
TMM19020301-V14-03-page26.txt: [('th', 'o')]
TMM19020401-V14-04-page16.txt: [('pA', 'L')]
TMM19020401-V14-04-page47.txt: [('Sul', 'a')]
TMM19020401-V14-04-page52.txt: [('ma', 'M')]
TMM19020501-V14-05-page1.txt: [('TH', 'E')]
TMM19020501-V14-05-page11.txt: [('suc', 'cess')]
TMM19020501-V14-05-page2.txt: [('GA', 'g')]
TMM19020501-V14-05-page48.txt: [('oc', 'H'), ('BENEV', 'OLENT')]
TMM19020501-V14-05-page7.txt: [('MAGA', 'ZINE')]
In [42]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/TMM/correction5

Average verified rate: 0.984760022894214

Average of error rates: 0.022267532467532464

Total token count: 861353

In [43]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[43]:
[('e', 484),
 ('w', 476),
 ("'", 466),
 ('m', 342),
 ('t', 342),
 ('r', 312),
 ('d', 302),
 ('n', 300),
 ('f', 271),
 ('g', 254),
 ('th', 107),
 ('x', 76),
 ('co', 69),
 ('k', 68),
 ('u', 64),
 ('pa', 63),
 ('z', 63),
 ('io', 42),
 ('oc', 40),
 ('oo', 33),
 ('cc', 31),
 ('al', 21),
 ('q', 21),
 ('mt', 20),
 ('re', 20),
 ('hausaland', 19),
 ('id', 19),
 ('ary', 19),
 ('ft', 19),
 ('stauffer', 19),
 ('zo', 18),
 ('basle', 18),
 ('mo', 18),
 ('couva', 17),
 ('kalaka', 17),
 ('hasegawa', 17),
 ('sul', 17),
 ('okohira', 16),
 ('helsingfors', 15),
 ('pp', 15),
 ('sabbathschool', 15),
 ("hours'", 15),
 ('schwantes', 15),
 ('raiatea', 15),
 ('wm', 15),
 ('ro', 15),
 ('ioo', 14),
 ('seventhday', 14),
 ('ic', 14),
 ('te', 14)]

Correction 6 -- Rejoin Split Words II

In [45]:
# %load shared_elements/rejoin_split_words.py
prev = cycle
cycle = "correction6"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)
    
    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    errors = reports.identify_errors(tokens, spelling_dictionary)

    replacements = clean.check_if_stem(errors, spelling_dictionary, tokens, get_prior=True)
    
    if len(replacements) > 0:
        print('{}: {}'.format(filename, replacements))

        for replacement in replacements:
            content = clean.replace_split_words(replacement, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
TMM18980201-V10-02-page37.txt: [('V', 'ermont')]
TMM18980301-V10-03-page25.txt: [('No', 'se')]
TMM18980301-V10-03-page39.txt: [('MisSION', 'ARY')]
TMM18980501-V10-05-page33.txt: [('whole', 'heartedness')]
TMM18980601-V10-06-page16.txt: [('r', 'OW'), ('the', 'Re')]
TMM18980901-V10-09-page5.txt: [('sub', 'terranean')]
TMM18981001-V10-10-page28.txt: [('w', 'ork')]
TMM18981201-V10-12-page42.txt: [('u', 'tA')]
TMM18981201-V10-12-page45.txt: [('Character', 'istic')]
TMM18990101-V11-01-page14.txt: [('es', 'Pecially')]
TMM18990101-V11-01-page47.txt: [('MISSION', 'ARY')]
TMM18990101-V11-01-page48.txt: [('repre', 'sentative')]
TMM18990201-V11-02-page45.txt: [('o', 'ffer')]
TMM18990201-V11-02-page54.txt: [('MISSION', 'ARY')]
TMM18990301-V11-03-page34.txt: [('Num', 'ber')]
TMM18990301-V11-03-page38.txt: [('MISSION', 'ARY')]
TMM18990301-V11-03-page9.txt: [('r', 'oth')]
TMM18990401-V11-04-page38.txt: [('MISSION', 'ARY')]
TMM18990401-V11-04-page39.txt: [('repre', 'sentative')]
TMM18990501-V11-05-page22.txt: [('req', 'uirement')]
TMM18990501-V11-05-page26.txt: [('in', 'ti')]
TMM18990501-V11-05-page46.txt: [('MISSION', 'ARY')]
TMM18990601-V11-06-page46.txt: [('MISSION', 'ARY')]
TMM18990701-V11-07-page17.txt: [('a', 'ny')]
TMM18990701-V11-07-page5.txt: [('gov', "ernor's")]
TMM18990801-V11-08-page11.txt: [('a', 'tt'), ('t', 'il'), ('s', 'gt'), ('at', 'co'), ('a', 'te'), ('a', 'RIZ')]
TMM18990801-V11-08-page20.txt: [('p', 'ork')]
TMM18990901-V11-09-page3.txt: [('MISSION', 'ARY')]
TMM18991001-V11-10-page37.txt: [('a', 're')]
TMM18991001-V11-10-page46.txt: [('an', 'swers')]
TMM18991201-V11-12-page29.txt: [('ha', 've')]
TMM18991201-V11-12-page40.txt: [('Nebuchadn', "ezzar's")]
TMM19000101-V12-01-page43.txt: [('a', 'nd')]
TMM19000101-V12-01-page47.txt: [('MISSION', 'ARY')]
TMM19000101-V12-01-page52.txt: [('DEVELOP', 'MENT')]
TMM19000201-V12-02-page29.txt: [('a', 'nd')]
TMM19000201-V12-02-page36.txt: [('corre', 'sponding')]
TMM19000201-V12-02-page49.txt: [('Miss', 'IONARY')]
TMM19000201-V12-02-page51.txt: [('Mission', 'arY')]
TMM19000301-V12-03-page42.txt: [('my', 'thology')]
TMM19000301-V12-03-page45.txt: [('hard', 'ly')]
TMM19000301-V12-03-page8.txt: [('the', 'Ta')]
TMM19000401-V12-04-page25.txt: [('car', 'ried')]
TMM19000501-V12-05-page32.txt: [('MISSION', 'ARY')]
TMM19000501-V12-05-page33.txt: [('wonder', 'ful')]
TMM19000501-V12-05-page5.txt: [('second', 'ary')]
TMM19000601-V12-06-page10.txt: [('a', 'li')]
TMM19000601-V12-06-page11.txt: [('be', 'ng')]
TMM19000601-V12-06-page52.txt: [('the', 'Remin')]
TMM19000801-V12-08-page34.txt: [('to', 'Shiba')]
TMM19000801-V12-08-page5.txt: [('cent', 'uries')]
TMM19000901-V12-09-page51.txt: [('con', 'nection')]
TMM19001001-V12-10-page31.txt: [('a', 'nd')]
TMM19001001-V12-10-page44.txt: [('re', 'ct'), ('a', 're')]
TMM19001001-V12-10-page52.txt: [('A', 'IL')]
TMM19001001-V12-10-page6.txt: [('Am', 'alekites')]
TMM19001001-V12-10-page8.txt: [('MISSION', 'ARY')]
TMM19001101-V12-11-page51.txt: [('P', 'hiladelphia')]
TMM19001201-V12-12-page2.txt: [('co', 'mpany')]
TMM19001201-V12-12-page23.txt: [('k', 'eeping')]
TMM19020201-V14-02-page21.txt: [('MISSION', 'ARY')]
TMM19020201-V14-02-page31.txt: [('Aguas', 'Calientes')]
TMM19020201-V14-02-page48.txt: [('Bell', 'oc')]
TMM19020201-V14-02-page52.txt: [('CEN', 'TRAL'), ('R', 'IP')]
TMM19020301-V14-03-page2.txt: [('B', 'RA')]
TMM19020301-V14-03-page3.txt: [('CEN', 'TRAL')]
TMM19020301-V14-03-page34.txt: [('con', 'verts')]
TMM19020401-V14-04-page3.txt: [('CEN', 'TRAL')]
TMM19020501-V14-05-page17.txt: [('the', 'mis')]
TMM19020501-V14-05-page2.txt: [('a', 'GA')]
TMM19020501-V14-05-page3.txt: [('CEN', 'TRAL'), ('E', 'xcursion')]
TMM19020501-V14-05-page8.txt: [('pro', 'tection')]
In [48]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/TMM/correction6

Average verified rate: 0.9848273395829028

Average of error rates: 0.02219012987012987

Total token count: 861286

In [49]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[49]:
[('e', 484),
 ('w', 475),
 ("'", 466),
 ('m', 342),
 ('t', 342),
 ('r', 310),
 ('d', 302),
 ('n', 300),
 ('f', 271),
 ('g', 254),
 ('th', 107),
 ('x', 76),
 ('co', 68),
 ('k', 67),
 ('u', 64),
 ('pa', 63),
 ('z', 63),
 ('io', 42),
 ('oc', 40),
 ('oo', 33),
 ('cc', 31),
 ('al', 21),
 ('q', 21),
 ('mt', 20),
 ('hausaland', 19),
 ('id', 19),
 ('ft', 19),
 ('stauffer', 19),
 ('zo', 18),
 ('basle', 18),
 ('mo', 18),
 ('couva', 17),
 ('kalaka', 17),
 ('hasegawa', 17),
 ('sul', 17),
 ('re', 17),
 ('okohira', 16),
 ('helsingfors', 15),
 ('pp', 15),
 ('sabbathschool', 15),
 ("hours'", 15),
 ('schwantes', 15),
 ('raiatea', 15),
 ('wm', 15),
 ('ro', 15),
 ('ioo', 14),
 ('seventhday', 14),
 ('ic', 14),
 ("''", 13),
 ('te', 13)]

Review Remaining Errors

In [50]:
reports.docs_with_high_error_rate(summary)
Out[50]:
[('TMM18980701-V10-07-page42.txt', 1.0),
 ('TMM18990201-V11-02-page10.txt', 0.9),
 ('TMM19000301-V12-03-page9.txt', 0.614),
 ('TMM18980101-V10-01-page4.txt', 0.605),
 ('TMM18991101-V11-11-page23.txt', 0.534),
 ('TMM18980301-V10-03-page25.txt', 0.517),
 ('TMM18990801-V11-08-page11.txt', 0.512),
 ('TMM18980401-V10-04-page4.txt', 0.5),
 ('TMM18990301-V11-03-page17.txt', 0.5),
 ('TMM18990701-V11-07-page10.txt', 0.5),
 ('TMM18990401-V11-04-page4.txt', 0.5),
 ('TMM18980301-V10-03-page6.txt', 0.449),
 ('TMM18980601-V10-06-page20.txt', 0.448),
 ('TMM19020301-V14-03-page2.txt', 0.363),
 ('TMM18980501-V10-05-page28.txt', 0.341),
 ('TMM18980301-V10-03-page18.txt', 0.333),
 ('TMM18980301-V10-03-page10.txt', 0.333),
 ('TMM18990501-V11-05-page48.txt', 0.333),
 ('TMM18990801-V11-08-page48.txt', 0.321),
 ('TMM19001001-V12-10-page1.txt', 0.317),
 ('TMM19000601-V12-06-page48.txt', 0.302),
 ('TMM19000101-V12-01-page10.txt', 0.291),
 ('TMM18990101-V11-01-page48.txt', 0.263),
 ('TMM18981201-V10-12-page20.txt', 0.25),
 ('TMM19000501-V12-05-page4.txt', 0.25),
 ('TMM18990601-V11-06-page48.txt', 0.25),
 ('TMM18980701-V10-07-page38.txt', 0.235),
 ('TMM18980201-V10-02-page13.txt', 0.219),
 ('TMM19020501-V14-05-page51.txt', 0.213)]
In [52]:
# %load shared_elements/high_error_rates.py
doc_keys = [x[0] for x in reports.docs_with_high_error_rate(summary) if x[1] > 0.3]

utilities.open_original_docs(doc_keys, directories['cycle'])
Opened files: 

TMM18980701-V10-07-page42.txt

TMM18990201-V11-02-page10.txt

TMM19000301-V12-03-page9.txt

TMM18980101-V10-01-page4.txt

TMM18991101-V11-11-page23.txt

TMM18980301-V10-03-page25.txt

TMM18990801-V11-08-page11.txt

TMM18980401-V10-04-page4.txt

TMM18990301-V11-03-page17.txt

TMM18990701-V11-07-page10.txt

TMM18990401-V11-04-page4.txt

TMM18980301-V10-03-page6.txt

TMM18980601-V10-06-page20.txt

TMM19020301-V14-03-page2.txt

TMM18980501-V10-05-page28.txt

TMM18980301-V10-03-page18.txt

TMM18980301-V10-03-page10.txt

TMM18990501-V11-05-page48.txt

TMM18990801-V11-08-page48.txt

TMM19001001-V12-10-page1.txt

TMM19000601-V12-06-page48.txt

Most of the high error documents match the usual pattern of maps, images, and charts. One interesting exception is "TMM18980301-V10-03-page25.txt", which is in Spanish. I examined the original OCR and there were no accent marks that were lost during normalizing.

In [55]:
reports.long_errors(errors_summary, min_length=15)
Out[55]:
(['austria-hungaria',
  'scripture-sabbath',
  'gospel-commission',
  'elevatedrailroad',
  'newly-established',
  'darjeeling-above',
  'spanish-speaking',
  'soul-and-body-destroying',
  'into-insignificance',
  'heaven-descended',
  'hastily-organized',
  'greatgrandparents',
  "'globetrottings'",
  'hard-heartedness',
  'stivkimikarkaaagiaiwatkaaiiiikiiiikit',
  'scene-guadalajara',
  'self-commendation',
  'apoitleshipbeitring',
  'interestinebible',
  'fourteenyear-old',
  'mamouret-ul-aziz',
  'nezdterrerwiethe',
  'cigarette-papers',
  'pylrlitigeltrlile',
  'self-denyingfollowers',
  'joinherinherlabors',
  'artificially-made',
  "controversy'''among",
  'out-stations--one',
  'long-experienced',
  'intelligent-looking',
  'milkailkiiticimallikillitcattikilit',
  'spanish-american',
  'otherispanish-speaking',
  'charity-begins-at-home',
  'self-complacency',
  "looks'upon-their",
  'inexactconformity',
  'self-aggrandizement',
  'daughters-in-law',
  'fourthsabbathexercise',
  'sonderburg-glucksburg',
  'christianfarmers',
  'waterloo-jamaica',
  'thickly-timbered',
  'fire-worshippers',
  'frontispiece-mamma',
  'blood-corpuscles',
  'literally-fulfiled',
  'erichermerchantsintheusualway',
  'kaailikarkalikaii',
  'nrinfimparirlittlawawlit',
  'appropropriately',
  'innocent-looking',
  'vapolreidzeerrewniteh',
  'inspector-general',
  'chinese-japanese',
  'accountabilities',
  'tamtatikivnityleysa',
  'pricedinitrzements',
  'pleasure-seekers',
  "sabbath-keeper's",
  'lifrimmiiimenspirmiivinfillir',
  'governorgenerals',
  'ersacizaznovovar',
  'ereationaniagara',
  'trial--freeesendme',
  'printing-presses',
  'sasiiiiisnamiximinbegpeemnize',
  'avcitosivivocktickpeptv',
  'calvary-redeemed',
  'rifirmtairiiiitliww',
  'self-forgetfulness',
  'civilizationexists',
  'nieswaynorkadvocateofworld',
  'more-than-one-halffinger-long',
  'ronoliilgichinese',
  'buildingresembling',
  'ofwhose-pronunciation',
  'tlimmoutrlosillummultrm',
  'christ-followers',
  'orikakipkokyikartikawaavaikaaiiikaiiikaitio',
  'lengthandquality',
  'fircaraitisttiattkiisikaikaikatwatekattit',
  'sp-anish-speaking',
  'doppelschraubenpostdampfer',
  'panama-hat-is-aproduetion-of',
  'christianity-its',
  'narrow-mindedness',
  'ilhaillimillkillitilliiraliirailhallikakiiilkilitilikiiilki',
  'intienregstdxperience',
  'semi-independent',
  'sugar-plantation',
  'self-propagation',
  'buluwayo-zambesia',
  'swedish-speaking',
  'simple-mindedness',
  'anti-progressive',
  'rotilezdtftervtee',
  'christianizingthe',
  'frontispiece-harbor',
  'attentiontshould',
  'infaluableutenaill',
  'broad-shouldered',
  'pcmammariscloist',
  'italian-speaking',
  'health-restoring',
  'long-established',
  'sixteenprovinces',
  'learningsomething',
  'mievivimmeirinfirfa',
  'recently-developed',
  'non-commissioned',
  'church-fellowship',
  'english-japanese',
  "stauffer'szletter",
  'germanicthoroughlyy',
  'doppelsehraubenpostdampfer',
  'church-membership',
  'anidrgceontfaolutnsd',
  'japanese-english',
  'commandment-keeping',
  'well-proportioned',
  'nativity-interior',
  'french-switzerland',
  'powakikrilifwvfairarlit',
  'american-spanish',
  'ttttttttttttttttts',
  'frontispiece-thatched',
  'tfirinfargiiralt',
  'subscriptionsshould',
  'poorly-furnished',
  'self-established',
  'alexandria-troas',
  'irillillkillrallikillbilibilirrillralillkillullitilllikillp',
  'coffee-producing',
  'artistically-built',
  'boarding-schools',
  'fanning-machines',
  'amphitheatershaped',
  'wthetreyovuaneotrsierpyiyot',
  'kindergarten-school',
  'fellow-passenger',
  'gorgeously-arrayed',
  'generalconference',
  'hopelessicondition',
  'sabbath-breaking',
  'ratilikalattidir',
  'accommodation-houses',
  'fitthstliamascit',
  'earthquake-visited',
  'ositviivtablebbk',
  'interspersedhere',
  'brilliantly-lighted',
  'fellow-countrymen',
  'poverty-stricken',
  'ethical-political',
  'light-complexioned',
  'commander-in-chief',
  'chimney-blackened',
  'thrashing-machine',
  'tkilikarstatatamtitaiwookiiiikakotarkit',
  'governor-general',
  'anti-footbinding',
  'alexandria-troas--had',
  'firfinifwillirlrilliirilirlitilitalrerwlik'],
 15)
In [ ]: