Sligo-OCR-Evaluation-and-Correction

In [1]:
%load_ext autoreload
In [2]:
%autoreload 2
In [3]:
from text2topics import reports
from text2topics import utilities
from text2topics import clean
import re
import os
from os import listdir
from os.path import isfile, join
import collections
In [4]:
%matplotlib inline
In [5]:
wordlist_dir = "/Users/jeriwieringa/Dissertation/drafts/data/word-lists"
wordlists = ["2016-12-07-SDA-last-names.txt", 
             "2016-12-07-SDA-place-names.txt", 
             "2016-12-08-SDA-Vocabulary.txt", 
             "2017-01-03-place-names.txt", 
             "2017-02-14-Base-Word-List-SCOWL&KJV.txt",
             "2017-02-14-Roman-Numerals.txt",
             "2017-03-01-Additional-Approved-Words.txt"
            ]
In [6]:
spelling_dictionary = utilities.create_spelling_dictionary(wordlist_dir, wordlists)
In [7]:
title = "Sligo"
In [8]:
base_dir = "/Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/{}/".format(title)

Baseline

In [9]:
cycle = 'baseline'
In [10]:
stats = reports.overview_report(join(base_dir, cycle), spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/Sligo/baseline

Average verified rate: 0.9446963320935031

Average of error rates: 0.10142254901960783

Total token count: 294664

In [11]:
errors_summary = reports.get_errors_summary( stats )
reports.top_errors( errors_summary, 20 )
Out[11]:
[("'", 695),
 ('d', 692),
 ('m', 674),
 ('w', 660),
 ('-', 472),
 ('e', 367),
 ('n', 285),
 ('f', 239),
 ('r', 228),
 ('g', 227),
 ("-'", 226),
 ('ñ', 195),
 ('th', 177),
 ('¥', 165),
 ('t', 103),
 ('co', 99),
 ('con-', 85),
 (')', 84),
 ('tion', 83),
 ('re-', 65),
 ('ment', 59),
 ('in-', 55),
 ('(', 48),
 ('k', 44),
 ('col-', 42),
 ('be-', 41),
 ('=', 40),
 ('lege', 37),
 ('stu-', 37),
 ('sligon', 36),
 ('de-', 35),
 ('com-', 34),
 ('_', 34),
 ('ex-', 33),
 ('(continued', 32),
 ('ence', 31),
 ('schwab', 30),
 ('mis-', 29),
 ('u', 27),
 ('ñthe', 27),
 ('dis-', 26),
 ('numbers)', 24),
 ('pro-', 24),
 ('pa', 24),
 ('un-', 23),
 ('ber', 23),
 ('im-', 23),
 ('pre-', 22),
 ('mattingly', 22),
 ('editor-in-chief', 21),
 ('mt', 21)]

Review Special Character Use

In [12]:
reports.tokens_with_special_characters(errors_summary)
Out[12]:
[('ñ', 195),
 ('¥', 165),
 (')', 84),
 ('(', 48),
 ('=', 40),
 ('_', 34),
 ('(continued', 32),
 ('ñthe', 27),
 ('numbers)', 24),
 ('\\', 16),
 ('/', 14),
 ('ã', 12),
 ('(plain)', 12),
 ('%', 10),
 ('ñin', 10),
 ('ñelder', 9),
 ('ñand', 9),
 ('[', 9),
 ('£', 8),
 ('*', 8),
 ('ña', 8),
 ('(opposite', 7),
 ('¥-', 7),
 ('willard)', 7),
 ('ô', 6),
 ('¥¥', 6),
 ('ñone', 6),
 ('(formerly', 6),
 ('ñwhen', 6),
 ('+', 6),
 ('preferñwhether', 5),
 ('¥¥¥', 5),
 ('ñpresident', 5),
 ('`', 5),
 ('ñit', 5),
 ('(no', 5),
 ('ñnot', 4),
 ('-¥', 4),
 ('heñ', 4),
 ('workñcleaning', 4),
 ('addressñ', 4),
 ('ñthat', 4),
 ('ñan', 4),
 ('(class', 4),
 ('paperñyou', 4),
 ('ñto', 4),
 ('(midnight)', 4),
 ('ñwhat', 4),
 (']', 4),
 ('(poem)', 3),
 ('ec)', 3),
 ('ñdr', 3),
 ('¥t', 3),
 ('placeñ', 3),
 ('~', 3),
 ('ñwell', 3),
 ('¥¥¥¥', 3),
 ('(successors', 3),
 ('-=', 3),
 ('accessibleñ', 3),
 ('=¥', 3),
 ('net)', 3),
 ('(and', 3),
 ('ñasbury', 3),
 ('ñed', 3),
 ('(the', 3),
 ('cords)', 3),
 ("wavingñchildren's", 3),
 ('#', 3),
 ('^', 3),
 ('kidd)', 3),
 ('(not', 3),
 ('ñyes', 3),
 ('i)', 3),
 ('ñhow', 3),
 ('¥the', 2),
 ('ãã', 2),
 ('billsñ', 2),
 ('ñfrom', 2),
 ('eyeñan', 2),
 ('__', 2),
 ('(a)', 2),
 ('--_', 2),
 ('c)', 2),
 ('byñ', 2),
 ('ñthose', 2),
 ('ñdo', 2),
 ('different)', 2),
 ('>', 2),
 ('(b)', 2),
 ('¡', 2),
 ('(insist', 2),
 ('creationña', 2),
 ('(corner', 2),
 ('ñjohn', 2),
 ('terminalñleaving', 2),
 ('himñand', 2),
 ('ñprofessor', 2),
 ('label)', 2),
 ('¦', 2),
 ('ãããããããã', 2),
 ('ñhair', 2),
 ('needñthe', 2),
 ('moore)', 2),
 ('ñbecause', 2),
 ('*bop', 2),
 ('ããããããããã', 2),
 ("¥'", 2),
 ('ñclass', 2),
 ('ñyou', 2),
 ('(apologies', 2),
 ('problemsñthe', 2),
 ('the¥', 2),
 ('yearñ', 2),
 ("`no'", 2),
 ('lincolnñthe', 2),
 ('(incorporated)', 2),
 ('`well', 2),
 ('`do', 2),
 ('sunday_prom-', 2),
 ('¥¥¥¥¥', 2),
 ('ñstarting', 2),
 ("becauseñi'm", 2),
 ('ñsorenson', 2),
 ('ñi', 2),
 ('h)', 2),
 ('(every', 2),
 ('hatñand', 1),
 ('studyñprayerñwork', 1),
 ('aboveñtransitory', 1),
 ('`¥', 1),
 ('¥al', 1),
 ('ãshall', 1),
 ('`cheerful', 1),
 ('appearedñi', 1),
 ('rictservic_oe', 1),
 ('(florida)', 1),
 ('\\\\\\\\nn', 1),
 ('ñsmile', 1),
 ('vocabularyñ', 1),
 ('privilegeñ', 1),
 ('tut)', 1),
 ('expediencyñthe', 1),
 ('ñbeecher', 1),
 ('suppliesñhospital', 1),
 ('friedñfried', 1),
 ('i_', 1),
 ('dtiã', 1),
 ('speechesñin', 1),
 ('sarimmmowin¥', 1),
 ('avenue)', 1),
 ('ñcounsels', 1),
 ('alwaysñ', 1),
 ('ñpermanent', 1),
 ('be)', 1),
 ('`just', 1),
 ('banñ', 1),
 ('few)', 1),
 ('¥-¥-¥', 1),
 ('(mail', 1),
 ('oteã', 1),
 ('scienceñnone', 1),
 ("¡¡'", 1),
 ('exzeitsteasearzegaôentrznoreaca', 1),
 ('(preparatory)', 1),
 ('[e', 1),
 ('_a', 1),
 ('(n', 1),
 ('heartñto', 1),
 ('workersñall', 1),
 ('ñruskin', 1),
 ('partñand', 1),
 ('ñfor', 1),
 ('(this', 1),
 ('testñ', 1),
 ('ñthere', 1),
 ('habitñcultivate', 1),
 ('slidesñreal', 1),
 ('eika)l-', 1),
 ('itñreal', 1),
 ('**', 1),
 ('importantñthings', 1),
 ('--=', 1),
 ('days)', 1),
 ('ccosagzegoatteeztemewssms=', 1),
 ('girl)', 1),
 ('doorñ', 1),
 ('beautifulñfine', 1),
 ('gaspedñand', 1),
 ('michiganñsecretary-treasurer', 1),
 ('childñyet', 1),
 ('sightñ', 1),
 ('pportunityñresponsibility', 1),
 ('ñwhy', 1),
 ('=--', 1),
 ('ñhannah', 1),
 ('developedñalmost', 1),
 ('educationñnot', 1),
 ('women)', 1),
 ('worldñone', 1),
 ('e/ye', 1),
 ('ofñthe', 1),
 ('revelationñun-', 1),
 ('todayñtomorrow', 1),
 ('silberñcorder', 1),
 ('\\--/', 1),
 ('betterñ', 1),
 ('experienceña', 1),
 ('thinkingñthese', 1),
 ('vouchñ', 1),
 ('hopesñwilliam', 1),
 ('ó', 1),
 ('comesñright', 1),
 ('*eirmr', 1),
 ('courseñwashington', 1),
 ('ikii~r', 1),
 ('schoolñthat', 1),
 ('to¥', 1),
 ("csr's)c)mit", 1),
 ('ãflulduer', 1),
 ('(but', 1),
 ('ñread', 1),
 ('lr__j', 1),
 ('(ducation', 1),
 ('ragg=', 1),
 ('ñsome', 1),
 ('ñisa', 1),
 ('ñ_-', 1),
 ('greaterñ', 1),
 ('doñand', 1),
 ('actuallyñ', 1),
 ('-%', 1),
 ('passwordñover', 1),
 ('(vice-presi-', 1),
 ('`ffial', 1),
 ('ñannouncements', 1),
 ('learningñchristian', 1),
 ('neal)', 1),
 ('seasñif', 1),
 ('b¡', 1),
 ('butñ', 1),
 ('lossesñthose', 1),
 ('has/', 1),
 ('òrag', 1),
 ('ñhe', 1),
 ('god)', 1),
 ('estepñstephan', 1),
 ('ñboiled', 1),
 ('ambitionñeither', 1),
 ('leadersñstrong-hearted', 1),
 ('association)', 1),
 ('worldñto', 1),
 ('f--=', 1),
 ('(prolonged', 1),
 ('opportunityñhe', 1),
 ('ñwere', 1),
 ('courageñunsullied', 1),
 ("¥c'", 1),
 ('skyñpass', 1),
 ('whereñand', 1),
 ('painñ', 1),
 ('seventh-day¥', 1),
 ('i(', 1),
 ('pag¥', 1),
 ('ñdisposition', 1),
 ('ñsam', 1),
 ("lessonñdon't", 1),
 ('¦a', 1),
 ('j)', 1),
 ('vindicatedñ', 1),
 ('nationsñhear', 1),
 ('selfñsome', 1),
 ('allñand', 1),
 ('(that', 1),
 ('ñgeorgie', 1),
 ('(best', 1),
 ('estedñas', 1),
 ('erve>', 1),
 ('possessionñour', 1),
 ('leadersñjust', 1),
 ('-_-j', 1),
 ('<', 1),
 ('kindñmy', 1),
 ('triedñgod', 1),
 ('ñstrickland', 1),
 ('proofñproof', 1),
 ('i(kilposnep', 1),
 ('trailñnot', 1),
 ('ladiesñthe', 1),
 ('promiseñmendelssohn', 1),
 ('sell(', 1),
 ('((college', 1),
 ('musicñthe', 1),
 ('areñspeaking', 1),
 ('wantñ', 1),
 ('ratesñespecial/p', 1),
 ('womenñpower', 1),
 ('ñjames', 1),
 ('pennsylvaniañpresident', 1),
 ('ñhouse', 1),
 ('¡=', 1),
 ('moroseña', 1),
 ('studyñprayer', 1),
 ('instinctñsympathy', 1),
 ("'`", 1),
 ('americansñan', 1),
 ('ñthis', 1),
 ('charityñdespite', 1),
 ('happyñ', 1),
 ('(jr', 1),
 ('ñhappy', 1),
 ('bleña', 1),
 ('excitingñ', 1),
 ('ñsixty-four', 1),
 ('margin)', 1),
 ('topicñwhere', 1),
 ('manñunderstood', 1),
 ('_i', 1),
 ('educationñand', 1),
 ('stitutionñis', 1),
 ('civilizedñsome', 1),
 ('swishñand', 1),
 ('machineñwasting', 1),
 ('tripñordering', 1),
 ('ñdear', 1),
 ('virginia)', 1),
 ('priceñmaking', 1),
 ('rg-jñj', 1),
 ('ñlongfelldw', 1),
 ('dutyñthat', 1),
 ("^-'", 1),
 ("ñchrist's", 1),
 ('¡nip', 1),
 ('>yeaw', 1),
 ('/()', 1),
 ('sonñfor', 1),
 ('sacrificingñshall', 1),
 ('g-_-', 1),
 ('warningñtake', 1),
 ('twoñexternal', 1),
 ('(iiff', 1),
 ('giverñgive', 1),
 ('universeñthe', 1),
 ('badñ', 1),
 ('himñyes', 1),
 ('comeñthe', 1),
 ('ocem_at', 1),
 ('seañhe', 1),
 ('mwmareñ', 1),
 ('r-r_r', 1),
 ('herñ', 1),
 ('(illinois)', 1),
 ('floorñopp', 1),
 ('ñoliver', 1),
 ('ñmoody', 1),
 ('wereñi', 1),
 ('buyñor', 1),
 ('òthe', 1),
 ('ñout', 1),
 ('mosesñthe', 1),
 ('nmiki_a', 1),
 ('even*', 1),
 ('prepare=d', 1),
 ('(california)', 1),
 ('¦for', 1),
 ('al]', 1),
 ('make¥', 1),
 ('*tubents', 1),
 ('ordersñeven', 1),
 ('youñ', 1),
 ('hallñ', 1),
 ('religionsñbhuddism', 1),
 ('creaturesñwho', 1),
 ('talentsña', 1),
 ('rci)', 1),
 ('(in', 1),
 ('()', 1),
 ('forñ', 1),
 ('mile)', 1),
 ('heñthe', 1),
 ('workersñmen', 1),
 ('/w¥¥¥', 1),
 ('ñnorth', 1),
 ('(tenn', 1),
 ('reportñjust', 1),
 ('r**', 1),
 ('co-operationñteam', 1),
 ('forgetñlest', 1),
 ('f_-', 1),
 ('planña', 1),
 ('==immi=imeame==', 1),
 ("rl'_", 1),
 ('ñprof', 1),
 ('ñharriett', 1),
 ('ourselvesñ', 1),
 ('((', 1),
 ('ñwednesday', 1),
 ('glassñproxy', 1),
 ('il¥--', 1),
 ('foolishñso', 1),
 ('¥¥=', 1),
 ('--mmi¥mimimmin¥', 1),
 ('boneñ', 1),
 ('d[something', 1),
 ('¥i', 1),
 ('ñwas', 1),
 ('❑o', 1),
 ('mulesñsay', 1),
 ('endingñan', 1),
 ('illusionñsomething', 1),
 ('*theater', 1),
 ('travelersña', 1),
 ('w¥', 1),
 ('whileñour', 1),
 ('(sligo', 1),
 ('(`vv', 1),
 ('waysñ', 1),
 ('ñeven', 1),
 ('ãas', 1),
 ('¥him', 1),
 ('_-', 1),
 ('motherñem-', 1),
 ('¡i', 1),
 ('*rue', 1),
 ('ñmiss', 1),
 ('others)', 1),
 ('(hearty', 1),
 ('andñof', 1),
 ('-t_--', 1),
 ('ñfrances', 1),
 ('\ufeff', 1),
 ('_ligonian', 1),
 ('(including', 1),
 ('¥off', 1),
 ('spotñ', 1),
 ('[the', 1),
 ('cuttingñelectrical', 1),
 ('ñour', 1),
 ('parlorñand', 1),
 ('pulseñ', 1),
 ('typewriterñholder', 1),
 ('wow)', 1),
 ('`the', 1),
 ('(after', 1),
 ('wayñwhatever', 1),
 ('statedñand', 1),
 ('beganñ', 1),
 ('`higher', 1),
 ('¦if', 1),
 ('e]', 1),
 ('ingñit', 1),
 ('ñillustrated', 1),
 ('ars¥¥¥¥', 1),
 ('paperñ', 1),
 ('¥cr', 1),
 ('(he', 1),
 ('(tennessee)', 1),
 ('thi¥', 1),
 ('becauseñ', 1),
 ('ñafter', 1),
 ('failureñat', 1),
 ('ñaffirm', 1),
 ('factña', 1),
 ('anb)', 1),
 ('viewñmerely', 1),
 ('t=timi=', 1),
 ('(a', 1),
 ('ñshe', 1),
 ('timeñthat', 1),
 ('youñwhat', 1),
 ('***--', 1),
 ('thingsñ', 1),
 ('artñexhibited', 1),
 ('citle_', 1),
 ('headñone', 1),
 ('%iiiiiimili', 1),
 ('sightñall', 1),
 ('smilesñand', 1),
 ('/or', 1),
 ('(sanitarium', 1),
 ('ñadv', 1),
 ('lialialmea=', 1),
 ('ôj', 1),
 ('tionñgiant', 1),
 ('(oregon)', 1),
 ('page)-', 1),
 ('lelandñsanders', 1),
 ('¥-fv¥a', 1),
 ("*wva'alaraibill", 1),
 ('¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥',
  1),
 ('ñjust', 1),
 ('observation)', 1),
 ('aloft¡', 1),
 ('=_', 1),
 ('li=', 1),
 ('//', 1),
 ('(colorado)', 1),
 ('peopleña', 1),
 ('attentionñ', 1),
 ('wilaxôrtagi/scv', 1),
 ('fireñthe', 1),
 ('¥cts', 1),
 ("-+'", 1),
 ('i_i', 1),
 ('laborsñfor', 1),
 ('(mass', 1),
 ('existenceña', 1),
 ('{c', 1),
 ('/oztr', 1),
 ('ñquiet', 1),
 ('ñd', 1),
 ('maryland)', 1),
 ("'¥¥", 1),
 ('c>c/p', 1),
 ('umbrellañor', 1),
 ("'ó'", 1),
 ('knowledgeñthis', 1),
 ('collegeñ', 1),
 ('gasñ', 1),
 ('ããããã', 1),
 ('amateurñ', 1),
 ('himmihihh¥', 1),
 ('ñhymns', 1),
 ('\\y', 1),
 ('_el', 1),
 ('wilsonñthe', 1),
 ('long¥', 1),
 ('salesñthe', 1),
 ('collegeña', 1),
 ('ôfor', 1),
 ('s¥le', 1),
 ('ñr', 1),
 ('packageñfor', 1),
 ('sabachthaniñmy', 1),
 ('hmmmw]w', 1),
 ('sawñi', 1),
 ('invitationñ', 1),
 ('horribleñi', 1),
 ('ѣ', 1),
 ('momentñgiven', 1),
 ('whoñof', 1),
 ('`professor', 1),
 ('neededña', 1),
 ('president¥machlan', 1),
 ('rw_oft', 1),
 ('thisñlike', 1),
 ('quartetteñmessrs', 1),
 ('rightñ', 1),
 ('(see', 1),
 ('%(wear', 1),
 ('hydrotherapyñwater', 1),
 ('itñbut', 1),
 ('{', 1),
 ('wordsñ', 1),
 ('(ipa', 1),
 ('phraseñ', 1),
 ('-*', 1),
 ('makersñ', 1),
 ('kneeñall', 1),
 ('mumps¥', 1),
 ('ñfebruary', 1),
 ('=-', 1),
 ('ñconsecration', 1),
 ('dayñas', 1),
 ('±lattrni', 1),
 ('eami=mt=', 1),
 ('ñst', 1),
 ('(low)', 1),
 ('fomentationsñwas', 1),
 ('washingtonñand', 1),
 ('toñ', 1),
 ('eatsñsandwiches', 1),
 ('---i-j-r_d', 1),
 ('ñtalents', 1),
 ('ñno', 1),
 ('et(', 1),
 ('herña', 1),
 ('namit)', 1),
 ('nothingña', 1),
 ('engineersñex-', 1),
 ('ñkern', 1),
 ('ñjosiah', 1),
 ('conesñthey', 1),
 ('oros¥pharmacists', 1),
 ('into)', 1),
 ('cetã', 1),
 ('eveningñyes', 1),
 ('freeñto', 1),
 ("o'erñ", 1),
 ('ñsomething', 1),
 ('scñz-dit', 1),
 ('achievementñ', 1),
 ('ãããããããããããããããããããããããããã', 1),
 ('(pest', 1),
 ('i[i', 1),
 ('eeimm/mmim', 1),
 ('=i=', 1),
 ('-_-=', 1),
 ('`studious', 1),
 ('effortsñgood', 1),
 ('marathi)', 1),
 ('words)', 1),
 ('bandñthey', 1),
 ('song-booksña', 1),
 ('educationñtoleration', 1),
 ('w¥-¥', 1),
 ("±'", 1),
 ('ñchopin', 1),
 ('))', 1),
 ('********', 1),
 ('ñcertainly', 1),
 ('ñcommencementñdegrees', 1),
 ('todayñ', 1),
 ('ñapologies', 1),
 ('doñshe', 1),
 ('depositingñmosquito', 1),
 ('forcesñcoal', 1),
 ('ô--/', 1),
 ('(apolo-', 1),
 ('thenñthe', 1),
 ('(first', 1),
 ('¥=', 1),
 ('idenñedwards', 1),
 ('tearsñ', 1),
 ('expressionñboth', 1),
 ('library)', 1),
 ('ñmen', 1),
 ('tubeñ', 1),
 ('unfulfilledñwork', 1),
 ('ñedith', 1),
 ('mr_', 1),
 ('ioo[off', 1),
 ('(shacks)', 1),
 ('conversationñwith', 1),
 ('wayñ', 1),
 ('-k¥-', 1),
 ('_=', 1),
 ('unselfishnessñthe', 1),
 ('e)', 1),
 ("'¥", 1),
 ('bookñthe', 1),
 ('ñadvt', 1),
 ('lifeñif', 1),
 ('ñbusiness', 1),
 ('dopeñ', 1),
 ('spiritñthat', 1),
 ('ñis', 1),
 ('¥^', 1),
 ('ñinland', 1),
 ('loyaltyñ', 1),
 ('resolvedñthat', 1),
 ('*words', 1),
 ('parsonageñin', 1),
 ('ñcanon', 1),
 ('audienceñit', 1),
 ('ñjunior-senior', 1),
 ('=a', 1),
 ('wilkinsonñbrown', 1),
 ('`lord', 1),
 ('meñ', 1),
 ('ii+j', 1),
 ('mr¥', 1),
 ('ization)', 1),
 ('usñand', 1),
 ('ff_p_acry', 1),
 ('ñwashington', 1),
 ('/ea', 1),
 ('cfã', 1),
 ('`she', 1),
 ('god_', 1),
 ('ãa', 1),
 ('(authorized', 1),
 ('con¥', 1),
 ('worldñprofessor', 1),
 ('ñdenton', 1),
 ('tourñcork', 1),
 ('r]', 1),
 ('menñeven', 1),
 ('>-', 1),
 ('whiteñall', 1),
 ("s¥e-'", 1),
 ('treeñ', 1),
 ('kindñ', 1),
 ('dresser)', 1),
 ('houseñphipps', 1),
 ('shullñstowe', 1),
 ('or%', 1),
 ('himñ', 1),
 ('ñstudent', 1),
 ('e=', 1),
 ('r¥~', 1),
 ('(sometimes', 1),
 ('(laughter', 1),
 ('[this', 1),
 ('(which', 1),
 ('}=', 1),
 ('tirm_', 1),
 ('~el', 1),
 ("('", 1),
 ('tt¥', 1),
 ('¥tams', 1),
 ('christianityña', 1),
 ('[our', 1),
 ('`innocuous', 1),
 ('courseñthe', 1),
 ('godñalthough', 1),
 ('physiqueñbut', 1),
 ('=momgmoments', 1),
 ('canvassingñthe', 1),
 ('treasureñthe', 1),
 ('onlyñfrom', 1),
 ('(for', 1),
 ('iñhave', 1),
 ('weñyou', 1),
 ('ingñin', 1),
 ('indifferentñmakes', 1),
 ('recite¥', 1),
 ('livesñ', 1),
 ('muchñsometimes', 1),
 ('*ay', 1),
 ('feet)', 1),
 ('days_', 1),
 ('t¥', 1),
 ('ñtwenty', 1),
 ('[skating', 1),
 ('checksñin', 1),
 ('ñwhile', 1),
 ('the¥lady', 1),
 ('r_', 1),
 ('¥duller', 1),
 ('=meeimme', 1),
 ('ãeat', 1),
 ('unwaveringñinto', 1),
 ('waterñby', 1),
 ('girlñshe', 1),
 ('out¥', 1),
 ('fieldñwhat', 1),
 ('ñor', 1),
 ('successña', 1),
 ('meantñfor', 1),
 ('werlineñgraham', 1),
 ('withñ', 1),
 ('ñvirginia', 1),
 ('elementñ', 1),
 ('guessñthey', 1),
 ('e¥-¥', 1),
 ('argumentñan', 1),
 ('dawn¥light', 1),
 ('r¥', 1),
 ('sermon*', 1),
 ('ãããããoããããããããããããã', 1),
 ('forty-flu¥', 1),
 ('aboutñthe', 1),
 ('timeñin', 1),
 ('o~', 1),
 ('ñlowell', 1),
 ('talkña', 1),
 ('ñreregistration', 1),
 ('heartña', 1),
 ('ããiii', 1),
 ('ñwith', 1),
 ('p©', 1),
 ('=ebbe', 1),
 ('faithña', 1),
 ('washboardñwhen', 1),
 ('¥nan', 1),
 ('blyb¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥', 1),
 ('~ijop', 1),
 ('locomotiveñthe', 1),
 ('occasionñmakes', 1),
 ('convenient)', 1),
 ('loveñi', 1),
 ('aboutñthey', 1),
 ('fie/', 1),
 ('clothesñsupporting', 1),
 ('¥fi', 1),
 ('worldñw', 1),
 ('ñausonius', 1),
 ('candiesñnorris', 1),
 ('nowhereñjust', 1),
 ('andñthey', 1),
 ('ñlet', 1),
 ('stillñ', 1),
 ('nothingñ', 1),
 ('manñabraham', 1),
 ('e_', 1),
 ('ñthey', 1),
 ('wordsñall', 1),
 ("`punk'", 1),
 ('valueñenduring', 1),
 ('ñbut', 1),
 ('ñfranklin', 1),
 ('walkedñout', 1),
 ('ñmrs', 1),
 ('inñ', 1),
 ('professionñor', 1),
 ('=dnb', 1),
 ('d[[every', 1),
 ('dl[the', 1),
 ('peakñabove', 1),
 ('cupñ', 1),
 ('wearyñto', 1),
 ('pat_', 1),
 ('vm¥', 1),
 ('there¥', 1),
 ('\\¥', 1),
 ('~~~', 1),
 ('ñnixon', 1),
 ('ñphysical', 1),
 ('ñso', 1),
 ('roadñ', 1),
 ('breakingñand', 1),
 ('*trrrt', 1),
 ('()lord', 1),
 ('(my', 1),
 ('classñmen', 1),
 ('thenñ', 1),
 (')f', 1),
 ('sanitariumña', 1),
 ('forty-twoñand', 1),
 ('ñby', 1),
 ('youñwhich', 1),
 ('sentinel_', 1),
 ('(wonlif', 1),
 ('headsñand', 1),
 ('ka-xap¥', 1),
 ('certsñwhen', 1),
 ('wordña', 1),
 ('breastñwhat', 1),
 ('`grandstand', 1),
 ('={=', 1),
 ('e}', 1),
 ('ñbaccalaureate', 1),
 ('sacrificeñyour', 1),
 ('moneyñ', 1),
 ('ñanon', 1),
 ('timeñat', 1),
 ('(concluded', 1),
 ('(concluded)', 1),
 ('ñnbt', 1),
 ('*¥¥¥¥mm¥¥¥¥¥¥¥', 1),
 ('to\\', 1),
 ('¥¥¥¥¥¥¥¥', 1),
 ('windñ', 1),
 ('serviceñ', 1),
 ('notñcan', 1),
 ('boneñand', 1),
 ('oneñedison', 1),
 ('shunñ', 1),
 ('ñjanette', 1),
 ('(c)', 1),
 ('futilityñis', 1),
 ('ñabove', 1),
 ('ñphillips', 1),
 ('[¡', 1),
 ('hereñthe', 1),
 ('ñvan', 1),
 ('th>-obb', 1),
 ('maccauleyñphilo', 1),
 ('o/', 1),
 ('ãããããããããããããffil', 1),
 ('(one', 1),
 (')-tri_jipa-r', 1),
 ('(judging', 1),
 ('appreciationñ', 1),
 ('shellsñat', 1),
 ('successful)', 1),
 ('¥*', 1),
 ('ssn(-', 1),
 ('meetingñwhat', 1),
 ('%-', 1),
 ('r=', 1),
 ('effectiveñdisposed', 1),
 ('ñsalisbury', 1),
 ('glorious¥', 1),
 ('onñthey', 1),
 ('ñjacob', 1),
 ('besnit¡', 1),
 ('ñmr', 1),
 ("\\\\\\''", 1),
 ('¥academy', 1),
 ('t-_-', 1),
 ('foolsñ', 1),
 ('apparentñ', 1),
 ('symposium)', 1),
 ('polishñpower', 1),
 ('e=e=}=f=', 1),
 ('allyflizeet)', 1),
 ('¥¥¥¥¥¥¥¥¥¥¥-¥', 1),
 ('gainñ', 1),
 ('dieña', 1),
 ('bloodñex-', 1),
 ('eightña', 1),
 ('willingnessñ', 1),
 ('`i`sisa', 1),
 ('andña', 1),
 ('el)', 1),
 ('grandmothersñthe', 1),
 ('¥--p', 1),
 ('e=iie', 1),
 ('¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥-¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥-', 1),
 ('i\\', 1),
 ("'/e/", 1),
 ('=i', 1),
 ('+hfim', 1),
 ('%r¥', 1),
 ('kimballñ', 1),
 ('enjoyingñthat', 1),
 ('warñthat', 1),
 ('camñ', 1),
 ('(ed', 1),
 ("lord'sñlynes", 1),
 ('/lw', 1),
 ('(ott', 1),
 ('ãããããããããããããã', 1),
 ('educationñ', 1),
 ('(pardon', 1),
 ('industriesñthe', 1),
 ('=it=', 1),
 ('deliveranceñ', 1),
 ('oet/p', 1),
 ('weekña', 1),
 ('[j', 1),
 ('`had', 1),
 ('ñemerson', 1),
 ('dentifriceñ', 1),
 ("six-o'clock_dinner", 1),
 ('ñmendelssohn', 1),
 ('penñ', 1),
 ('delayñjoin', 1),
 ('ñnevertheless', 1),
 ('ñshould', 1),
 ('tellerña', 1),
 ('ñnews', 1),
 ('cite/natan', 1),
 ('ãããã', 1),
 ('ñmalaria', 1),
 ('speedñthe', 1),
 ('(fonserration', 1),
 ('¥test', 1),
 ('oa*', 1),
 ('handñall', 1),
 ('notñ', 1),
 ('seasonñthe', 1),
 ('sligoicai\\t', 1),
 ('typeñthe', 1),
 ('/g', 1),
 ('r¥r', 1),
 ('soundedñand', 1),
 ('(as', 1),
 ('ñalbertsworth', 1),
 ('ñhaving', 1),
 ('beforeñthat', 1),
 ('otemñiteñ', 1),
 ('yearñand', 1),
 ('bluff~ng', 1),
 ('memoryñcalm', 1),
 ('(invented', 1),
 ('missingñbelieved', 1),
 ('cee/nolt', 1),
 ('triedñto', 1),
 ('itñexactly', 1),
 ('¥nl', 1),
 ("mach]an's", 1),
 ('meritñthe', 1),
 ('runñlove', 1),
 ('/v', 1),
 ('standardñ', 1),
 ('prosaicñno', 1),
 ('===i', 1),
 ('`foolishness', 1),
 ('freely(', 1),
 ('zee/', 1),
 ('eagernessñto', 1),
 ('*ilatu', 1),
 ('sayñi', 1),
 ('ñwho', 1),
 ('_year', 1),
 ("¥'when", 1),
 ('folksñunderstand', 1),
 ('a¥', 1),
 ('salesman)', 1),
 ('ranñ', 1),
 ('(c', 1),
 ('mattersñhave', 1),
 ('}rrjr', 1)]

Correction 1 -- Normalize Characters

In [13]:
# %load shared_elements/normalize_characters.py
prev = "baseline"
cycle = "correction1"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    # Substitute for all other dashes
    content = re.sub(r"—-—–‑", r"-", content)

    # Substitute formatted apostrophe
    content = re.sub(r"\’\’\‘\'\‛\´", r"'", content)
    
    # Replace all special characters with a space (as these tend to occur at the end of lines)
    content = re.sub(r"[^a-zA-Z0-9\s,.!?$:;\-&\'\"]", r" ", content)
    
    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
In [14]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/Sligo/correction1

Average verified rate: 0.9503606581884528

Average of error rates: 0.08799901960784313

Total token count: 294323

In [15]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[15]:
[("'", 707),
 ('d', 697),
 ('m', 674),
 ('w', 665),
 ('-', 510),
 ('e', 379),
 ('n', 286),
 ('f', 242),
 ('r', 240),
 ('g', 228),
 ("-'", 227),
 ('th', 178),
 ('t', 109),
 ('co', 99),
 ('con-', 85),
 ('tion', 84),
 ('re-', 65),
 ('ment', 59),
 ('in-', 55),
 ('k', 44),
 ('col-', 42),
 ('be-', 41),
 ('stu-', 37),
 ('lege', 37),
 ('sligon', 36),
 ('de-', 35),
 ('ex-', 35),
 ('com-', 34),
 ('ence', 31),
 ('schwab', 30),
 ('mis-', 29),
 ('u', 27),
 ('dis-', 26),
 ('un-', 24),
 ('pa', 24),
 ('pro-', 24),
 ('ber', 23),
 ('--', 23),
 ('im-', 23),
 ('pre-', 22),
 ('mattingly', 22),
 ('mt', 22),
 ('editor-in-chief', 21),
 ('kuppenheimer', 20),
 ('mem-', 20),
 ('ful', 20),
 ('inter-', 20),
 ('en-', 20),
 ('ac-', 20),
 ('wash-', 20)]

Correction 2 -- Correct Line Endings

In [16]:
# %load shared_elements/correct_line_endings.py
prev = cycle
cycle = "correction2"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    content = re.sub(r"(\w+)(\-\s{1,})([a-z]+)", r"\1\3", content)

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
In [17]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/Sligo/correction2

Average verified rate: 0.9661467546429712

Average of error rates: 0.07646568627450981

Total token count: 291139

In [18]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[18]:
[("'", 707),
 ('d', 697),
 ('m', 673),
 ('w', 665),
 ('-', 504),
 ('e', 379),
 ('n', 286),
 ('f', 241),
 ('r', 238),
 ('g', 228),
 ("-'", 227),
 ('th', 178),
 ('t', 109),
 ('co', 99),
 ('k', 44),
 ('sligon', 36),
 ('schwab', 30),
 ('u', 27),
 ('pa', 24),
 ('--', 23),
 ('mt', 22),
 ('mattingly', 22),
 ('editor-in-chief', 21),
 ('kuppenheimer', 20),
 ("painters'", 19),
 ('va', 19),
 ('ph', 17),
 ('-minute', 16),
 ('lippart', 14),
 ('kamoda', 14),
 ('styleplus', 14),
 ('gonian', 14),
 ('dietel', 13),
 ('rebok', 13),
 ('house-furnishings', 13),
 ('herzog', 13),
 ('flather', 12),
 ('kimonas', 12),
 ('z', 12),
 ('ahrens', 11),
 ('chesnutt', 11),
 ('ailes', 11),
 ('friedlander', 11),
 ("cardia's", 11),
 ('ei', 11)]

Correction 3 -- Remove Extra Dashes

In [19]:
# %load shared_elements/remove_extra_dashes.py
prev = cycle
cycle = "correction3"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)
    
    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    
    replacements = []
    for token in tokens:
        if token[0] is "-":
            replacements.append((token, token[1:]))
            
        elif token[-1] is "-":
            replacements.append((token, token[:-1]))
        else:
            pass
        
    if len(replacements) > 0:
        print("{}: {}".format(filename, replacements))
        
        for replacement in replacements:
            content = clean.replace_pair(replacement, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
Sligo19160401-V01-01-page20.txt: [('WASH-', 'WASH'), ('bright-', 'bright')]
Sligo19160401-V01-01-page23.txt: [('-', '')]
Sligo19160401-V01-01-page27.txt: [('-minute', 'minute'), ('-minute', 'minute'), ('-minute', 'minute'), ('-minute', 'minute'), ('-minute', 'minute'), ('-minute', 'minute'), ('-minute', 'minute'), ('-minute', 'minute'), ('-', '')]
Sligo19160401-V01-01-page7.txt: [('-.', '.'), ('F--', 'F-'), ('--', '-'), ('a-', 'a')]
Sligo19160501-VXX-XX-page14.txt: [('-', '')]
Sligo19160501-VXX-XX-page16.txt: [('-J', 'J'), ('-.', '.'), ("d'T-", "d'T"), ('-', ''), ('-', '')]
Sligo19160501-VXX-XX-page17.txt: [("-'", "'")]
Sligo19160501-VXX-XX-page18.txt: [('-', ''), ('-', ''), ('-', '')]
Sligo19160501-VXX-XX-page25.txt: [('-', ''), ("-.'re", ".'re"), ('-t-.', 't-.'), ('-', ''), ('-', '')]
Sligo19160501-VXX-XX-page27.txt: [("-'", "'"), ("-'", "'"), ("-'", "'"), ('-', ''), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ('-', ''), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'")]
Sligo19160501-VXX-XX-page28.txt: [("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ('-', '')]
Sligo19160501-VXX-XX-page30.txt: [('-.', '.'), ('-', '')]
Sligo19160501-VXX-XX-page32.txt: [("-'", "'"), ("-'", "'"), ('SLIGON-', 'SLIGON'), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'")]
Sligo19160501-VXX-XX-page34.txt: [('-', ''), ('-', ''), ('S."-', 'S."'), ('-', ''), ('--', '-'), ('-', ''), ('\'oki.".-', '\'oki.".'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ("-..o'i.", "..o'i."), ('-', ''), ('-"s', '"s'), ('-.', '.'), ('-', ''), ('-', '')]
Sligo19160501-VXX-XX-page38.txt: [('-', '')]
Sligo19160501-VXX-XX-page44.txt: [('P-', 'P'), ('-', ''), ('-', '')]
Sligo19160501-VXX-XX-page5.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
Sligo19160501-VXX-XX-page66.txt: [('-', ''), ('--P', '-P'), ('-', '')]
Sligo19160501-VXX-XX-page67.txt: [('hav-', 'hav')]
Sligo19160501-VXX-XX-page69.txt: [('repre-', 'repre')]
Sligo19160501-VXX-XX-page74.txt: [('WASH-', 'WASH'), ('ingrow-', 'ingrow')]
Sligo19160501-VXX-XX-page76.txt: [('-', ''), ('-', ''), ('.--', '.-'), ('-', '')]
Sligo19160501-VXX-XX-page78.txt: [('-Rr', 'Rr'), ('-', ''), ('--a', '-a'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-I-', 'I-'), ('-', ''), ('Jr-', 'Jr'), ('-', ''), ('-fv', 'fv'), ('--', '-')]
Sligo19160501-VXX-XX-page79.txt: [('-minute', 'minute'), ('-minute', 'minute'), ('-minute', 'minute'), ('-minute', 'minute'), ('-minute', 'minute'), ('-minute', 'minute'), ('-minute', 'minute'), ('-minute', 'minute'), ('-', '')]
Sligo19160501-VXX-XX-page8.txt: [('-a', 'a')]
Sligo19160501-VXX-XX-page82.txt: [('-', '')]
Sligo19160501-VXX-XX-page83.txt: [('-dt', 'dt')]
Sligo19160501-VXX-XX-page86.txt: [('-', ''), ("'-", "'"), ('-', ''), ('P--', 'P-'), ('--', '-'), ('-', ''), ('----', '---'), ('--I', '-I')]
Sligo19160501-VXX-XX-page87.txt: [('-', ''), ('-', '')]
Sligo19160501-VXX-XX-page88.txt: [('--tp.', '-tp.'), ('-ww', 'ww')]
Sligo19160901-V01-03,04-page21.txt: [('Vice-', 'Vice')]
Sligo19160901-V01-03,04-page24.txt: [('peace-', 'peace')]
Sligo19160901-V01-03,04-page27.txt: [('Lam-', 'Lam')]
Sligo19160901-V01-03,04-page30.txt: [('---', '--')]
Sligo19160901-V01-03,04-page31.txt: [('PRESS-', 'PRESS'), ('Cor-', 'Cor')]
Sligo19160901-V01-03,04-page32.txt: [('-', ''), ('CHIL-', 'CHIL')]
Sligo19160901-V01-03,04-page33.txt: [('-', '')]
Sligo19160901-V01-03,04-page34.txt: [('--', '-'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
Sligo19160901-V01-03,04-page35.txt: [('-', ''), ('-', '')]
Sligo19160901-V01-03,04-page5.txt: [('-', '')]
Sligo19160901-V01-03,04-page7.txt: [('-g.', 'g.')]
Sligo19161101-V01-05-page23.txt: [('WASH-', 'WASH')]
Sligo19161101-V01-05-page28.txt: [('PRESS-', 'PRESS')]
Sligo19161101-V01-05-page30.txt: [('-', ''), ('CHIL-', 'CHIL'), ('-e-', 'e-'), ('-', '')]
Sligo19161101-V01-05-page32.txt: [('-', ''), ('-', '')]
Sligo19161101-V01-05-page33.txt: [('-', '')]
Sligo19161201-V01-06-page14.txt: [('WASH-', 'WASH')]
Sligo19161201-V01-06-page18.txt: [('PRESS-', 'PRESS')]
Sligo19161201-V01-06-page19.txt: [('-', '')]
Sligo19161201-V01-06-page20.txt: [('-', ''), ('-', '')]
Sligo19161201-V01-06-page21.txt: [('-', ''), ('-Ai', 'Ai')]
Sligo19161201-V01-06-page22.txt: [('-', '')]
Sligo19161201-V01-06-page7.txt: [('hav-', 'hav')]
Sligo19170101-V01-07-page18.txt: [('-', ''), ('-', '')]
Sligo19170101-V01-07-page2.txt: [('-', ''), ('-', ''), ('-', '')]
Sligo19170101-V01-07-page24.txt: [('-', ''), ('PRESS-', 'PRESS')]
Sligo19170101-V01-07-page25.txt: [('-', '')]
Sligo19170101-V01-07-page26.txt: [('-', ''), ('-', '')]
Sligo19170101-V01-07-page27.txt: [('-', '')]
Sligo19170101-V01-07-page4.txt: [('-', ''), ('-', ''), ('-.N.VM', '.N.VM')]
Sligo19170201-V01-08-page12.txt: [('out-', 'out')]
Sligo19170201-V01-08-page2.txt: [('-', '')]
Sligo19170201-V01-08-page23.txt: [('-', '')]
Sligo19170201-V01-08-page24.txt: [('-T', 'T'), ('--', '-'), ('g-', 'g'), ('-', ''), ('-', ''), ('g-', 'g')]
Sligo19170201-V01-08-page30.txt: [('---dcLut', '--dcLut')]
Sligo19170201-V01-08-page32.txt: [('-', ''), ('-', '')]
Sligo19170201-V01-08-page33.txt: [('-', '')]
Sligo19170301-V01-09-page2.txt: [('---', '--'), ('PRESS-', 'PRESS'), ('ALTER-', 'ALTER')]
Sligo19170301-V01-09-page31.txt: [('-', '')]
Sligo19170301-V01-09-page33.txt: [('V.I.-', 'V.I.')]
Sligo19170301-V01-09-page34.txt: [('-', ''), ('-', '')]
Sligo19170301-V01-09-page35.txt: [('-', ''), ('-', ''), ('-', '')]
Sligo19170301-V01-09-page36.txt: [('-', ''), ('-J', 'J')]
Sligo19170301-V01-09-page7.txt: [('SLI-', 'SLI')]
Sligo19170501-V02-02-page14.txt: [('-', ''), ('-', '')]
Sligo19170501-V02-02-page16.txt: [('farm-', 'farm')]
Sligo19170501-V02-02-page19.txt: [("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ('-', ''), ("-'", "'"), ("-'", "'"), ("-'", "'"), ('-', ''), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ('-', ''), ('-', ''), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'")]
Sligo19170501-V02-02-page23.txt: [("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ('-', ''), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ('Danish-', 'Danish'), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'")]
Sligo19170501-V02-02-page27.txt: [("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ('-', ''), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ('-', ''), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'")]
Sligo19170501-V02-02-page31.txt: [("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ('-', ''), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ('-', ''), ("-'", "'"), ("-'", "'"), ('-', ''), ("-'", "'")]
Sligo19170501-V02-02-page36.txt: [("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ("-'", "'"), ('-', ''), ("-'", "'"), ("-'", "'"), ('-', ''), ("-'", "'"), ("-'", "'"), ('-', ''), ("-'", "'")]
Sligo19170501-V02-02-page42.txt: [('-he', 'he')]
Sligo19170501-V02-02-page53.txt: [('-', '')]
Sligo19170501-V02-02-page57.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
Sligo19170501-V02-02-page58.txt: [('-', '')]
Sligo19170501-V02-02-page59.txt: [('--ociagElvase-', '-ociagElvase-'), ('-', '')]
Sligo19170501-V02-02-page6.txt: [('-', '')]
Sligo19170501-V02-02-page61.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
Sligo19170501-V02-02-page62.txt: [('-Pres.', 'Pres.')]
Sligo19170501-V02-02-page63.txt: [('-', '')]
Sligo19170501-V02-02-page66.txt: [('-', '')]
Sligo19170901-V02-03,04-page10.txt: [('SANDAKER-', 'SANDAKER')]
Sligo19170901-V02-03,04-page15.txt: [('an-', 'an')]
Sligo19170901-V02-03,04-page17.txt: [('-does', 'does')]
Sligo19170901-V02-03,04-page2.txt: [('-', ''), ('-', '')]
Sligo19170901-V02-03,04-page27.txt: [('Vai-', 'Vai')]
Sligo19170901-V02-03,04-page29.txt: [('Cor-', 'Cor'), ('iM-', 'iM')]
Sligo19170901-V02-03,04-page3.txt: [('-', '')]
Sligo19170901-V02-03,04-page30.txt: [('-', '')]
Sligo19170901-V02-03,04-page31.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
Sligo19170901-V02-03,04-page32.txt: [('-', '')]
Sligo19170901-V02-03,04-page9.txt: [('HERR-', 'HERR')]
Sligo19180501-V03-02-page13.txt: [('rabuitirs-', 'rabuitirs')]
Sligo19180501-V03-02-page15.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
Sligo19180501-V03-02-page16.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
Sligo19180501-V03-02-page17.txt: [('-', ''), ('-', ''), ('-', ''), ('-', '')]
Sligo19180501-V03-02-page18.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
Sligo19180501-V03-02-page19.txt: [('-', ''), ('-', ''), ('-', '')]
Sligo19180501-V03-02-page20.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
Sligo19180501-V03-02-page21.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
Sligo19180501-V03-02-page22.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
Sligo19180501-V03-02-page23.txt: [('-', ''), ('-', ''), ('-', '')]
Sligo19180501-V03-02-page24.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
Sligo19180501-V03-02-page25.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('--Beethoven', '-Beethoven'), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
Sligo19180501-V03-02-page26.txt: [('-', ''), ('-', '')]
Sligo19180501-V03-02-page27.txt: [('-', '')]
Sligo19180501-V03-02-page29.txt: [('BECK-', 'BECK')]
Sligo19180501-V03-02-page30.txt: [('WILL-', 'WILL')]
Sligo19180501-V03-02-page53.txt: [('nub-', 'nub')]
Sligo19180501-V03-02-page56.txt: [('-', ''), ('-', ''), ('Labratoy-', 'Labratoy')]
Sligo19180501-V03-02-page63.txt: [('-volume', 'volume'), ('-volume', 'volume')]
Sligo19180501-V03-02-page65.txt: [('It-', 'It')]
Sligo19180501-V03-02-page67.txt: [('-day', 'day')]
Sligo19180501-V03-02-page68.txt: [('-', '')]
Sligo19180501-V03-02-page74.txt: [('-', ''), ('-', ''), ('-', '')]
Sligo19180501-V03-02-page77.txt: [('Md.-', 'Md.')]
Sligo19181101-V03-04-page13.txt: [('al-', 'al')]
Sligo19181101-V03-04-page14.txt: [('-', ''), ('-', ''), ('...--', '...-')]
Sligo19181101-V03-04-page2.txt: [('SLI-', 'SLI')]
Sligo19181101-V03-04-page23.txt: [('-', '')]
Sligo19181101-V03-04-page27.txt: [('RALS-', 'RALS')]
Sligo19181101-V03-04-page3.txt: [('A-', 'A'), ('N-', 'N'), ('-', ''), ('-.', '.'), ('-', ''), ('-', '')]
Sligo19181101-V03-04-page31.txt: [('-', '')]
Sligo19181101-V03-04-page33.txt: [('-', ''), ('-', '')]
Sligo19181101-V03-04-page35.txt: [('-', '')]
Sligo19181101-V03-04-page4.txt: [('-r', 'r'), ('offer-', 'offer'), ('-', ''), ('-to', 'to'), ('-obb', 'obb'), ('.s-', '.s'), ('-nearness', 'nearness'), ('-rt.', 'rt.'), ('-netr', 'netr'), ('a-', 'a'), ('-Yrte.', 'Yrte.'), ('FA-', 'FA'), ('for-', 'for')]
Sligo19181101-V03-04-page5.txt: [('-AN', 'AN')]
Sligo19181101-V03-04-page6.txt: [('-', '')]
Sligo19181201-V03-05-page10.txt: [('..ctasimmem-', '..ctasimmem')]
Sligo19181201-V03-05-page2.txt: [('-', '')]
Sligo19181201-V03-05-page29.txt: [('-', '')]
Sligo19181201-V03-05-page30.txt: [('-', '')]
Sligo19181201-V03-05-page31.txt: [('-', '')]
Sligo19181201-V03-05-page34.txt: [('RALS-', 'RALS')]
Sligo19181201-V03-05-page35.txt: [('SLI-', 'SLI')]
Sligo19181201-V03-05-page5.txt: [('think-', 'think')]
Sligo19181201-V03-05-page8.txt: [('"---', '"--'), ("-'", "'"), ('-', ''), ('-...', '...'), ('---', '--'), ('r---', 'r--'), ('..."--', '..."-'), ('-', ''), ('-"\'', '"\'')]
Sligo19190201-V03-05-page11.txt: [('in-', 'in')]
Sligo19190201-V03-05-page14.txt: [('-', '')]
Sligo19190201-V03-05-page15.txt: [('-', '')]
Sligo19190201-V03-05-page17.txt: [('-sleepy', 'sleepy')]
Sligo19190201-V03-05-page2.txt: [('-', '')]
Sligo19190201-V03-05-page25.txt: [('-Gibson', 'Gibson'), ('Christ-', 'Christ')]
Sligo19190201-V03-05-page28.txt: [('-', '')]
Sligo19190201-V03-05-page29.txt: [('-', ''), ('-', '')]
Sligo19190201-V03-05-page31.txt: [('-', ''), ('-', ''), ('-', ''), ('-', '')]
Sligo19190201-V03-05-page34.txt: [('RALS-', 'RALS')]
Sligo19190201-V03-05-page4.txt: [('-', '')]
Sligo19190201-V03-05-page5.txt: [('-VC', 'VC')]
Sligo19190201-V03-05-page8.txt: [('-', ''), ('-', ''), ('-', ''), ('-...', '...'), ('.-', '.'), ('-', ''), ('-.V', '.V'), ('--', '-'), ('-', ''), ('...-', '...'), ("..'-", "..'"), ('..--', '..-'), ('-', ''), ('....---', '....--'), ('--', '-'), ('-', ''), ('....----', '....---'), ('--', '-'), ('-', '')]
Sligo19190201-V03-06-page11.txt: [('in-', 'in')]
Sligo19190201-V03-06-page13.txt: [('-"', '"')]
Sligo19190201-V03-06-page14.txt: [('-', '')]
Sligo19190201-V03-06-page2.txt: [('-', '')]
Sligo19190201-V03-06-page25.txt: [('-the', 'the'), ('Christ-', 'Christ')]
Sligo19190201-V03-06-page28.txt: [('-', '')]
Sligo19190201-V03-06-page29.txt: [('-', ''), ('-', '')]
Sligo19190201-V03-06-page30.txt: [('R-', 'R')]
Sligo19190201-V03-06-page31.txt: [('-', ''), ('-', ''), ('-', ''), ('-', '')]
Sligo19190201-V03-06-page34.txt: [('RALS-', 'RALS')]
Sligo19190201-V03-06-page4.txt: [('-', '')]
Sligo19190201-V03-06-page5.txt: [('-', ''), ('-.', '.')]
Sligo19190201-V03-06-page8.txt: [('-', ''), ('-', ''), ('-', '')]
Sligo19190201-V04-05-page20.txt: [('revive.-', 'revive.')]
Sligo19190201-V04-05-page27.txt: [('--mmi', '-mmi'), ('Oak-LAI-', 'Oak-LAI')]
Sligo19190201-V04-05-page29.txt: [('Chapters-', 'Chapters'), ('Pages-', 'Pages')]
Sligo19190201-V04-05-page3.txt: [('---', '--')]
Sligo19190201-V04-05-page30.txt: [('way.-', 'way.')]
Sligo19190201-V04-05-page31.txt: [('-', '')]
Sligo19190201-V04-05-page33.txt: [('PROM-', 'PROM'), ('Hours-', 'Hours'), ('Sunday-', 'Sunday'), ('-', '')]
Sligo19190201-V04-05-page35.txt: [('-', ''), ('-', ''), ('-', ''), ('--', '-'), ('-', '')]
Sligo19190201-V04-05-page36.txt: [('-', '')]
Sligo19190201-V04-05-page5.txt: [('-The', 'The')]
Sligo19190301-V03-07-page1.txt: [('-', ''), ('DE-', 'DE'), ('....-', '....')]
Sligo19190301-V03-07-page10.txt: [('-', '')]
Sligo19190301-V03-07-page12.txt: [('-', '')]
Sligo19190301-V03-07-page15.txt: [('lec-', 'lec')]
Sligo19190301-V03-07-page19.txt: [('-', ''), ('-Xi', 'Xi'), ('-', ''), ('-', '')]
Sligo19190301-V03-07-page20.txt: [('work.-', 'work.')]
Sligo19190301-V03-07-page23.txt: [('--', '-'), ('--', '-')]
Sligo19190301-V03-07-page28.txt: [('C.-', 'C.')]
Sligo19190301-V03-07-page30.txt: [('-half', 'half')]
Sligo19190301-V03-07-page32.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
Sligo19190301-V03-07-page36.txt: [('e-', 'e'), ('-', ''), ('-', '')]
Sligo19190301-V03-07-page41.txt: [('-', '')]
Sligo19190301-V03-07-page43.txt: [('-', '')]
Sligo19190301-V03-07-page45.txt: [('-', '')]
Sligo19190301-V03-07-page47.txt: [('-', '')]
Sligo19190301-V03-07-page48.txt: [('RALS-', 'RALS')]
Sligo19190301-V03-07-page5.txt: [('en-', 'en')]
Sligo19190301-V03-07-page51.txt: [('sc-t-', 'sc-t')]
Sligo19190401-V03-08-page14.txt: [('Hence-', 'Hence')]
Sligo19190401-V03-08-page15.txt: [('"We-', '"We'), ('-', '')]
Sligo19190401-V03-08-page18.txt: [('sum-', 'sum')]
Sligo19190401-V03-08-page2.txt: [('-', ''), ('-', ''), ('-', '')]
Sligo19190401-V03-08-page23.txt: [('-', ''), ('.-', '.'), ('-', ''), ('---', '--'), ("'---", "'--"), ('experi-', 'experi'), ('---', '--')]
Sligo19190401-V03-08-page27.txt: [('A-', 'A')]
Sligo19190401-V03-08-page28.txt: [('-W', 'W')]
Sligo19190401-V03-08-page29.txt: [('cr-errt-d.e.a.-', 'cr-errt-d.e.a.')]
Sligo19190401-V03-08-page30.txt: [('-', '')]
Sligo19190401-V03-08-page32.txt: [('-', '')]
Sligo19190401-V03-08-page34.txt: [('RALS-', 'RALS')]
Sligo19190401-V04-06-page11.txt: [('arrest-', 'arrest')]
Sligo19190401-V04-06-page17.txt: [('-', '')]
Sligo19190401-V04-06-page24.txt: [('-coming', 'coming')]
Sligo19190401-V04-06-page25.txt: [('-', '')]
Sligo19190401-V04-06-page26.txt: [('-dfelt', 'dfelt')]
Sligo19190401-V04-06-page30.txt: [('-', '')]
Sligo19190401-V04-06-page31.txt: [('-', '')]
Sligo19190401-V04-06-page33.txt: [('PROM-', 'PROM'), ('EVENING-', 'EVENING'), ('Hours-', 'Hours'), ('-', ''), ('Sunday-', 'Sunday'), ('-', '')]
Sligo19190401-V04-06-page35.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
Sligo19190401-V04-06-page36.txt: [('Y-', 'Y'), ('-"\'', '"\''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-', '')]
Sligo19190401-V04-07-page17.txt: [('-Iis', 'Iis')]
Sligo19190401-V04-07-page24.txt: [('Mt-', 'Mt')]
Sligo19190401-V04-07-page29.txt: [('-.', '.'), ('t-', 't'), ('-', ''), ('-', ''), ('-', ''), ('V-', 'V')]
Sligo19190401-V04-07-page31.txt: [('-', '')]
Sligo19190401-V04-07-page33.txt: [('PROM-', 'PROM'), ('Hours-', 'Hours'), ('Sunday-', 'Sunday')]
Sligo19190401-V04-07-page35.txt: [('-', ''), ('-', ''), ('-', ''), ('--', '-'), ('-', '')]
Sligo19190401-V04-07-page36.txt: [('-', '')]
Sligo19190401-V04-07-page4.txt: [('-', '')]
Sligo19190401-V04-07-page7.txt: [('atten-', 'atten')]
Sligo19191001-V04-01-page1.txt: [('-', ''), ('-', ''), ('--', '-'), ('----', '---'), ('-', ''), ('-', ''), ('------', '-----'), ('-', ''), ('--', '-'), ('-k', 'k'), ('-', ''), ('...-', '...'), ('--', '-'), ('".-', '".')]
Sligo19191001-V04-01-page10.txt: [('the-', 'the')]
Sligo19191001-V04-01-page13.txt: [('-Tri', 'Tri')]
Sligo19191001-V04-01-page15.txt: [('ros-', 'ros')]
Sligo19191001-V04-01-page16.txt: [('non-', 'non')]
Sligo19191001-V04-01-page2.txt: [('--', '-'), ('-', ''), ('accessible--', 'accessible-')]
Sligo19191001-V04-01-page22.txt: [('-Jose', 'Jose'), ("-'", "'")]
Sligo19191001-V04-01-page25.txt: [('-', ''), ('-', '')]
Sligo19191001-V04-01-page28.txt: [('DIAG-', 'DIAG')]
Sligo19191001-V04-01-page29.txt: [('-', ''), ('-', '')]
Sligo19191001-V04-01-page33.txt: [('PRE-', 'PRE')]
Sligo19191001-V04-01-page34.txt: [('EX-', 'EX'), ('WASHINGTON-', 'WASHINGTON')]
Sligo19191001-V04-01-page35.txt: [('-', ''), ('-', '')]
Sligo19191001-V04-01-page7.txt: [('suspir-', 'suspir'), ('-', ''), ('-', ''), ('--a', '-a')]
Sligo19191001-V04-01-page8.txt: [('-are', 'are')]
Sligo19191101-V04-02-page1.txt: [('-', '')]
Sligo19191101-V04-02-page11.txt: [('conse-', 'conse')]
Sligo19191101-V04-02-page16.txt: [('SLI-', 'SLI'), ('Class-of-', 'Class-of')]
Sligo19191101-V04-02-page17.txt: [('of-', 'of')]
Sligo19191101-V04-02-page18.txt: [('Secretary-', 'Secretary'), ('regulai-', 'regulai')]
Sligo19191101-V04-02-page23.txt: [('-', ''), ("Sevent'-", "Sevent'"), ('-day', 'day'), ('-', ''), ('Mid-', 'Mid')]
Sligo19191101-V04-02-page24.txt: [('-SSRS.', 'SSRS.')]
Sligo19191101-V04-02-page28.txt: [('-', '')]
Sligo19191101-V04-02-page3.txt: [('-', '')]
Sligo19191101-V04-02-page30.txt: [('-MK', 'MK')]
Sligo19191101-V04-02-page34.txt: [('EX-', 'EX')]
Sligo19191101-V04-02-page35.txt: [('-', ''), ('-', ''), ('-', ''), ('-', '')]
Sligo19191101-V04-02-page7.txt: [('-', '')]
Sligo19191101-V04-02-page8.txt: [('per-', 'per')]
Sligo19201001-V05-01-page12.txt: [('r-rJ-', 'r-rJ'), ('-', ''), ('-', ''), ('-', ''), ('FJ-', 'FJ'), ('-a-I', 'a-I')]
Sligo19201001-V05-01-page15.txt: [('r-', 'r')]
Sligo19201001-V05-01-page16.txt: [('-', ''), ('C.-', 'C.'), ('-', '')]
Sligo19201001-V05-01-page2.txt: [('-', '')]
Sligo19201001-V05-01-page20.txt: [('L--', 'L-'), ('-', '')]
Sligo19201001-V05-01-page25.txt: [('sur-', 'sur')]
Sligo19201001-V05-01-page27.txt: [('doc-', 'doc')]
Sligo19201001-V05-01-page3.txt: [('-', '')]
Sligo19201001-V05-01-page31.txt: [('-', ''), ('--EitZeA', '-EitZeA')]
Sligo19201001-V05-01-page34.txt: [('-Fireside"', 'Fireside"'), ('-', '')]
Sligo19201001-V05-01-page36.txt: [('L-', 'L')]
Sligo19201001-V05-01-page4.txt: [('C.-', 'C.')]
Sligo19201001-V05-01-page7.txt: [('-', ''), ('-', ''), ('-', ''), ('Su-', 'Su'), ('-', '')]
Sligo19201001-V05-01-page8.txt: [('mis-', 'mis'), ('-I', 'I'), ('-A', 'A')]
Sligo19201101-V05-02-page1.txt: [('THE-', 'THE')]
Sligo19201101-V05-02-page10.txt: [('be-', 'be')]
Sligo19201101-V05-02-page14.txt: [('al-', 'al')]
Sligo19201101-V05-02-page16.txt: [('---I-J-r', '--I-J-r')]
Sligo19201101-V05-02-page17.txt: [('ap-', 'ap')]
Sligo19201101-V05-02-page19.txt: [('-men', 'men')]
Sligo19201101-V05-02-page20.txt: [('-', ''), ('-', ''), ('VIEW-', 'VIEW')]
Sligo19201101-V05-02-page29.txt: [('-', '')]
Sligo19201101-V05-02-page31.txt: [('-', ''), ('-', '')]
Sligo19201101-V05-02-page34.txt: [('-', ''), ('-', ''), ('-', ''), ('-W', 'W'), ('-', ''), ('-', '')]
Sligo19201101-V05-02-page36.txt: [('-', '')]
Sligo19201101-V05-02-page5.txt: [('-spirit', 'spirit')]
Sligo19201101-V05-02-page6.txt: [('col-', 'col')]
Sligo19201101-V05-02-page8.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-r', 'r')]
Sligo19201101-V05-02-page9.txt: [('en-', 'en')]
Sligo19201201-V05-03-page12.txt: [('responsi-', 'responsi')]
Sligo19201201-V05-03-page13.txt: [('-r', 'r'), ('-', ''), ('-.', '.'), ('-', ''), ('--', '-')]
Sligo19201201-V05-03-page16.txt: [('insti-', 'insti')]
Sligo19201201-V05-03-page17.txt: [('prac-', 'prac')]
Sligo19201201-V05-03-page28.txt: [('-J', 'J')]
Sligo19201201-V05-03-page29.txt: [('--', '-')]
Sligo19201201-V05-03-page31.txt: [('-', '')]
Sligo19201201-V05-03-page34.txt: [('-', ''), ('-', ''), ('-', ''), ('-', ''), ('-W', 'W'), ('-', ''), ('-', '')]
Sligo19201201-V05-03-page36.txt: [('-', ''), ('-E', 'E')]
Sligo19201201-V05-03-page5.txt: [('com-', 'com')]
In [20]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/Sligo/correction3

Average verified rate: 0.9685648560454713

Average of error rates: 0.07120882352941177

Total token count: 290821

In [21]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[21]:
[("'", 938),
 ('d', 700),
 ('m', 673),
 ('w', 668),
 ('e', 390),
 ('n', 291),
 ('r', 252),
 ('f', 244),
 ('g', 234),
 ('th', 178),
 ('t', 114),
 ('co', 105),
 ('k', 45),
 ('sligon', 37),
 ('schwab', 30),
 ('u', 27),
 ('pa', 24),
 ('mt', 23),
 ('mattingly', 22),
 ('kuppenheimer', 20),
 ("painters'", 19),
 ('editor-in-chief', 19),
 ('va', 19),
 ('ph', 17),
 ('x', 15),
 ('gonian', 14),
 ('lippart', 14),
 ('kamoda', 14),
 ('styleplus', 14),
 ('dietel', 13),
 ('z', 13),
 ('rebok', 13),
 ('herzog', 13),
 ('flather', 12),
 ('kimonas', 12),
 ('ahrens', 11),
 ('friedlander', 11),
 ('ailes', 11),
 ('ei', 11),
 ("cardia's", 11),
 ('chesnutt', 11)]

Correction 4 -- Remove extra quotation marks

In [22]:
# %load shared_elements/replace_extra_quotation_marks.py
prev = cycle
cycle = "correction4"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)
    
    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    
    corrections = []
    for token in tokens:
        token_list = list(token)
        last_char = token_list[-1]

        if last_char is "'":
            if len(token) > 1:
                if token_list[-2] is 's' or 'S':
                    pass
                else:
                    corrections.append((token, re.sub(r"'", r"", token)))
            else:
                pass
        elif token[0] is "'":
            corrections.append((token, re.sub(r"'", r"", token)))   
        else:
            pass
    
    if len(corrections) > 0:
        print('{}: {}'.format(filename, corrections))

        for correction in corrections:
            content = clean.replace_pair(correction, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
Sligo19160401-V01-01-page6.txt: [("'tween", 'tween'), ("'round", 'round')]
Sligo19160501-VXX-XX-page14.txt: [("'.", '.')]
Sligo19160501-VXX-XX-page24.txt: [("'.", '.')]
Sligo19160501-VXX-XX-page34.txt: [("'k", 'k'), ('\'oki.".', 'oki.".'), ("'..", '..')]
Sligo19160501-VXX-XX-page66.txt: [("'tfti", 'tfti')]
Sligo19160501-VXX-XX-page78.txt: [("'tween", 'tween'), ("'Till", 'Till')]
Sligo19161101-V01-05-page28.txt: [("'Fteg.", 'Fteg.')]
Sligo19161101-V01-05-page3.txt: [("'.", '.')]
Sligo19161201-V01-06-page11.txt: [("'round", 'round')]
Sligo19170101-V01-07-page19.txt: [("'ere", 'ere')]
Sligo19170101-V01-07-page24.txt: [("'The", 'The'), ("'Reg", 'Reg')]
Sligo19170201-V01-08-page11.txt: [("'Alice", 'Alice')]
Sligo19170201-V01-08-page12.txt: [("'em", 'em')]
Sligo19170201-V01-08-page15.txt: [("'cause", 'cause')]
Sligo19170201-V01-08-page24.txt: [("'Bout", 'Bout')]
Sligo19170201-V01-08-page25.txt: [("'em", 'em')]
Sligo19170301-V01-09-page12.txt: [("'Tis", 'Tis')]
Sligo19170301-V01-09-page15.txt: [("'tis", 'tis'), ("'spose", 'spose')]
Sligo19170301-V01-09-page32.txt: [("'Ras.", 'Ras.')]
Sligo19170501-V02-02-page48.txt: [("'gainst", 'gainst'), ("'heath", 'heath')]
Sligo19170501-V02-02-page51.txt: [('\'"', '"')]
Sligo19170501-V02-02-page59.txt: [("'Re", 'Re'), ("'Re.", 'Re.')]
Sligo19170501-V02-02-page66.txt: [("'tatirnwri", 'tatirnwri')]
Sligo19180501-V03-02-page14.txt: [("'s", 's')]
Sligo19180501-V03-02-page60.txt: [("'S", 'S')]
Sligo19180501-V03-02-page73.txt: [("'Reg.", 'Reg.')]
Sligo19181101-V03-04-page18.txt: [("'history", 'history')]
Sligo19181101-V03-04-page24.txt: [("'Tis", 'Tis'), ("'tis", 'tis')]
Sligo19181101-V03-04-page6.txt: [("'WM", 'WM')]
Sligo19181201-V03-05-page11.txt: [("'s", 's')]
Sligo19181201-V03-05-page18.txt: [("'tis", 'tis')]
Sligo19181201-V03-05-page9.txt: [("'a", 'a')]
Sligo19190201-V03-05-page11.txt: [("'a", 'a'), ("'Jut", 'Jut')]
Sligo19190201-V03-05-page13.txt: [("'year", 'year'), ("'Alone", 'Alone'), ("'Alone", 'Alone')]
Sligo19190201-V03-05-page18.txt: [("'master", 'master')]
Sligo19190201-V03-05-page30.txt: [("'Reg.", 'Reg.'), ("'Rc.g.", 'Rc.g.')]
Sligo19190201-V03-05-page9.txt: [("'the", 'the'), ("'What", 'What')]
Sligo19190201-V03-06-page13.txt: [("'year", 'year'), ("'Alone", 'Alone'), ("'Alone", 'Alone')]
Sligo19190201-V03-06-page19.txt: [("'Imre", 'Imre'), ("'Plane", 'Plane')]
Sligo19190201-V03-06-page9.txt: [("'the", 'the'), ("'What", 'What')]
Sligo19190201-V04-05-page10.txt: [("'Twas", 'Twas'), ("'Gaston", 'Gaston')]
Sligo19190201-V04-05-page12.txt: [("'personally", 'personally')]
Sligo19190201-V04-05-page16.txt: [("'xams", 'xams'), ("'xam", 'xam')]
Sligo19190201-V04-05-page17.txt: [("'Social", 'Social')]
Sligo19190201-V04-05-page2.txt: [("'bring", 'bring')]
Sligo19190201-V04-05-page6.txt: [("'cause", 'cause')]
Sligo19190201-V04-05-page7.txt: [("'Herbert", 'Herbert')]
Sligo19190201-V04-05-page8.txt: [("'In", 'In')]
Sligo19190201-V04-05-page9.txt: [("'Tis", 'Tis')]
Sligo19190301-V03-07-page19.txt: [("'Ito", 'Ito')]
Sligo19190301-V03-07-page28.txt: [("'He", 'He'), ("'Lives", 'Lives')]
Sligo19190301-V03-07-page31.txt: [("'Molly", 'Molly')]
Sligo19190301-V03-07-page36.txt: [("'Y", 'Y')]
Sligo19190301-V03-07-page6.txt: [("'Lift", 'Lift')]
Sligo19190401-V03-08-page12.txt: [("'Neath", 'Neath')]
Sligo19190401-V03-08-page15.txt: [("'human", 'human')]
Sligo19190401-V03-08-page20.txt: [("'WHEN", 'WHEN')]
Sligo19190401-V03-08-page28.txt: [("''SOMETHING", 'SOMETHING')]
Sligo19190401-V03-08-page32.txt: [("'Mama", 'Mama')]
Sligo19190401-V04-06-page13.txt: [("'Prudence", 'Prudence'), ("'Charity", 'Charity')]
Sligo19190401-V04-06-page17.txt: [("'View", 'View'), ("'for", 'for')]
Sligo19190401-V04-06-page2.txt: [("'uitS", 'uitS')]
Sligo19190401-V04-06-page36.txt: [("'I", 'I')]
Sligo19190401-V04-06-page5.txt: [('\'bide."', 'bide."'), ('\'bide."', 'bide."')]
Sligo19190401-V04-06-page9.txt: [("'miscellaneous", 'miscellaneous')]
Sligo19190401-V04-07-page14.txt: [("'mpossible", 'mpossible')]
Sligo19190401-V04-07-page25.txt: [("'s", 's')]
Sligo19190401-V04-07-page29.txt: [("'Z", 'Z'), ("'BE", 'BE'), ("''.''..", '...')]
Sligo19190401-V04-07-page3.txt: [("'Rte.", 'Rte.')]
Sligo19190401-V04-07-page33.txt: [("'PENNA.", 'PENNA.')]
Sligo19190401-V04-07-page36.txt: [("'t", 't')]
Sligo19190401-V04-07-page8.txt: [("'Tis", 'Tis')]
Sligo19191001-V04-01-page20.txt: [("'Tis", 'Tis')]
Sligo19191001-V04-01-page22.txt: [("'Miss", 'Miss')]
Sligo19191001-V04-01-page28.txt: [("'gamin", 'gamin')]
Sligo19191001-V04-01-page31.txt: [("'.", '.')]
Sligo19191001-V04-01-page5.txt: [("'pother", 'pother')]
Sligo19191101-V04-02-page11.txt: [("'acuity", 'acuity')]
Sligo19191101-V04-02-page20.txt: [("'r", 'r')]
Sligo19191101-V04-02-page8.txt: [("'acuity.", 'acuity.')]
Sligo19201001-V05-01-page11.txt: [("'Twill", 'Twill'), ("'Twill", 'Twill')]
Sligo19201001-V05-01-page14.txt: [("'his", 'his')]
Sligo19201001-V05-01-page16.txt: [("'You", 'You'), ("'I", 'I'), ("'You", 'You'), ("'We're", 'Were'), ("'Ah", 'Ah'), ("'tis", 'tis')]
Sligo19201101-V05-02-page1.txt: [("'T", 'T')]
Sligo19201101-V05-02-page31.txt: [("'Reg.", 'Reg.'), ("'Reg.", 'Reg.')]
Sligo19201101-V05-02-page9.txt: [("'good", 'good')]
Sligo19201201-V05-03-page18.txt: [("'to", 'to')]
Sligo19201201-V05-03-page31.txt: [("'Reg", 'Reg')]
Sligo19201201-V05-03-page8.txt: [("'ere", 'ere')]

Correction 5 -- Rejoin Split Words

In [23]:
# %load shared_elements/rejoin_split_words.py
prev = cycle
cycle = "correction5"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)
    
    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    errors = reports.identify_errors(tokens, spelling_dictionary)

    replacements = clean.check_if_stem(errors, spelling_dictionary, tokens, get_prior=False)
    
    if len(replacements) > 0:
        print('{}: {}'.format(filename, replacements))

        for replacement in replacements:
            content = clean.replace_split_words(replacement, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
Sligo19160401-V01-01-page17.txt: [('Mc', 'Manus')]
Sligo19160501-VXX-XX-page32.txt: [('SLIGON', 'IAN')]
Sligo19160501-VXX-XX-page41.txt: [('Winnif', 'red')]
Sligo19160501-VXX-XX-page66.txt: [('ri', 'A')]
Sligo19160501-VXX-XX-page78.txt: [('Na', 'ture'), ("E'", 'er'), ('Il', 'a')]
Sligo19160501-VXX-XX-page8.txt: [('ti', 'm')]
Sligo19160901-V01-03,04-page32.txt: [('CHIL', 'DREN')]
Sligo19160901-V01-03,04-page7.txt: [('HM', 'M')]
Sligo19160901-V01-03,04-page9.txt: [('SLIGON', 'IAN')]
Sligo19161101-V01-05-page30.txt: [('CHIL', 'DREN')]
Sligo19170101-V01-07-page18.txt: [('gli', 'M')]
Sligo19170201-V01-08-page12.txt: [('SLIGON', 'IAN')]
Sligo19170201-V01-08-page13.txt: [('SLIGON', 'IAN')]
Sligo19170201-V01-08-page19.txt: [('SLIGON', 'IAN')]
Sligo19170201-V01-08-page21.txt: [('SLIGON', 'IAN')]
Sligo19170301-V01-09-page7.txt: [('SLI', 'GONIAN'), ('SLIGON', 'IAN')]
Sligo19170501-V02-02-page14.txt: [('co', 'operated')]
Sligo19170501-V02-02-page31.txt: [('Pa', 'ge')]
Sligo19170901-V02-03,04-page13.txt: [('SLIGON', 'IAN')]
Sligo19170901-V02-03,04-page19.txt: [('SLIGON', 'IAN')]
Sligo19170901-V02-03,04-page22.txt: [('SLIGON', 'IAN')]
Sligo19170901-V02-03,04-page24.txt: [('wh', 'y')]
Sligo19170901-V02-03,04-page36.txt: [('LI', 'THE')]
Sligo19170901-V02-03,04-page6.txt: [('SLIGON', 'IAN')]
Sligo19180501-V03-02-page25.txt: [('Alberts', 'worth')]
Sligo19180501-V03-02-page62.txt: [('Barto', 'N')]
Sligo19180501-V03-02-page79.txt: [('Watchm', 'an')]
Sligo19181101-V03-04-page13.txt: [('SLIGON', 'IAN')]
Sligo19181101-V03-04-page14.txt: [('heav', 'y'), ('thi', 's')]
Sligo19181101-V03-04-page2.txt: [('SLI', 'GONIAN')]
Sligo19181101-V03-04-page22.txt: [('Se', 'c')]
Sligo19181101-V03-04-page27.txt: [('RALS', 'TON')]
Sligo19181101-V03-04-page33.txt: [('co', 'operation')]
Sligo19181101-V03-04-page4.txt: [('ra', 'se')]
Sligo19181101-V03-04-page5.txt: [('PA', 'P'), ('LI', 't')]
Sligo19181201-V03-05-page11.txt: [('SLIGON', 'IAN')]
Sligo19181201-V03-05-page15.txt: [('SLIGON', 'IAN')]
Sligo19181201-V03-05-page18.txt: [('ALU', 'M')]
Sligo19181201-V03-05-page19.txt: [('SLIGON', 'IAN')]
Sligo19181201-V03-05-page34.txt: [('RALS', 'TON')]
Sligo19190201-V03-05-page19.txt: [('ALU', 'M')]
Sligo19190201-V03-05-page23.txt: [('SLIGON', 'IAN')]
Sligo19190201-V03-05-page30.txt: [('LI', 'S')]
Sligo19190201-V03-05-page34.txt: [('RALS', 'TON')]
Sligo19190201-V03-05-page6.txt: [('SLIGON', 'IAN')]
Sligo19190201-V03-06-page15.txt: [('wor', 'ld')]
Sligo19190201-V03-06-page16.txt: [('SLIGON', 'IAN')]
Sligo19190201-V03-06-page23.txt: [('SLIGON', 'IAN')]
Sligo19190201-V03-06-page26.txt: [('SLIGON', 'IAN')]
Sligo19190201-V03-06-page29.txt: [('SLIGON', 'IAN')]
Sligo19190201-V03-06-page34.txt: [('RALS', 'TON')]
Sligo19190201-V04-05-page16.txt: [('pre', 'judice')]
Sligo19190201-V04-05-page24.txt: [('pre', 'paring')]
Sligo19190201-V04-05-page34.txt: [('Garmen', 'ts')]
Sligo19190301-V03-07-page38.txt: [('SLIGON', 'IAN')]
Sligo19190301-V03-07-page41.txt: [('TAKOM', 'A')]
Sligo19190301-V03-07-page48.txt: [('RALS', 'TON')]
Sligo19190401-V03-08-page11.txt: [('SLIGON', 'IAN')]
Sligo19190401-V03-08-page28.txt: [('TAKOM', 'A')]
Sligo19190401-V03-08-page34.txt: [('RALS', 'TON')]
Sligo19190401-V03-08-page6.txt: [('SLIGON', 'IAN')]
Sligo19190401-V03-08-page7.txt: [('SLIGON', 'IAN')]
Sligo19190401-V04-06-page11.txt: [('ab', 'e')]
Sligo19190401-V04-06-page30.txt: [('Washi', 'ng')]
Sligo19190401-V04-07-page12.txt: [('es', 'to')]
Sligo19190401-V04-07-page22.txt: [('SLIGON', 'IAN')]
Sligo19191001-V04-01-page12.txt: [('TH', 'E')]
Sligo19191001-V04-01-page16.txt: [('SLIGON', 'IAN')]
Sligo19191001-V04-01-page32.txt: [('Wai', 'sts')]
Sligo19191001-V04-01-page33.txt: [('PRE', 'PARED')]
Sligo19191001-V04-01-page34.txt: [('EX', 'AMINED')]
Sligo19191001-V04-01-page9.txt: [('SLIGON', 'IAN')]
Sligo19191101-V04-02-page11.txt: [('ma', 'y')]
Sligo19191101-V04-02-page16.txt: [('SLI', 'GONIAN')]
Sligo19191101-V04-02-page20.txt: [('holi', 'day')]
Sligo19191101-V04-02-page24.txt: [('th', 'at')]
Sligo19191101-V04-02-page34.txt: [('EX', 'AMINED')]
Sligo19201001-V05-01-page10.txt: [('SLIGON', 'IAN')]
Sligo19201001-V05-01-page16.txt: [('co', 'operation')]
Sligo19201001-V05-01-page2.txt: [('co', 'worker')]
Sligo19201001-V05-01-page21.txt: [('SLIGON', 'IAN')]
Sligo19201001-V05-01-page23.txt: [('Broa', 'dview')]
Sligo19201101-V05-02-page1.txt: [('AY', 'r')]
Sligo19201101-V05-02-page10.txt: [('al', 'good'), ('SLIGON', 'IAN')]
Sligo19201101-V05-02-page14.txt: [('al', 'A')]
Sligo19201101-V05-02-page17.txt: [('SLIGON', 'IAN')]
Sligo19201101-V05-02-page36.txt: [('co', 'ordinated')]
Sligo19201201-V05-03-page20.txt: [('princip', 'e')]
Sligo19201201-V05-03-page31.txt: [('RI', 'CHTER')]
Sligo19201201-V05-03-page36.txt: [('co', 'operation')]
In [24]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/Sligo/correction5

Average verified rate: 0.9693109936262353

Average of error rates: 0.06974901960784315

Total token count: 290723

In [25]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[25]:
[("'", 918),
 ('d', 700),
 ('m', 670),
 ('w', 668),
 ('e', 390),
 ('n', 291),
 ('r', 254),
 ('f', 244),
 ('g', 234),
 ('th', 177),
 ('t', 116),
 ('co', 99),
 ('k', 46),
 ('schwab', 30),
 ('u', 27),
 ('mt', 23),
 ('mattingly', 22),
 ('pa', 22),
 ('kuppenheimer', 20),
 ('editor-in-chief', 19),
 ("painters'", 19),
 ('va', 19),
 ('ph', 17),
 ('x', 15),
 ('kamoda', 14),
 ('z', 14),
 ('styleplus', 14),
 ('lippart', 14),
 ('dietel', 13),
 ('rebok', 13),
 ('herzog', 13),
 ('flather', 12),
 ('kimonas', 12),
 ('gonian', 11),
 ('friedlander', 11),
 ('ailes', 11),
 ('ahrens', 11),
 ('ei', 11),
 ('chesnutt', 11)]

Correction 6 -- Rejoin Split Words II

In [26]:
# %load shared_elements/rejoin_split_words.py
prev = cycle
cycle = "correction6"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)
    
    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    errors = reports.identify_errors(tokens, spelling_dictionary)

    replacements = clean.check_if_stem(errors, spelling_dictionary, tokens, get_prior=True)
    
    if len(replacements) > 0:
        print('{}: {}'.format(filename, replacements))

        for replacement in replacements:
            content = clean.replace_split_words(replacement, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
Sligo19160401-V01-01-page20.txt: [('WASH', 'INGTON')]
Sligo19160401-V01-01-page5.txt: [('the', 'ES'), ('B', 'EE')]
Sligo19160501-VXX-XX-page74.txt: [('WASH', 'INGTON')]
Sligo19160501-VXX-XX-page78.txt: [('b', 'Rr'), ('loy', 'al'), ('ar', 'butus')]
Sligo19160501-VXX-XX-page8.txt: [('It', 'Al')]
Sligo19160501-VXX-XX-page88.txt: [('w', 'ww')]
Sligo19160901-V01-03,04-page17.txt: [('r', 'ut')]
Sligo19160901-V01-03,04-page31.txt: [('W', 'ASHINGTON')]
Sligo19161101-V01-05-page23.txt: [('WASH', 'INGTON')]
Sligo19161201-V01-06-page14.txt: [('WASH', 'INGTON')]
Sligo19170101-V01-07-page5.txt: [('a', 'MIE')]
Sligo19170201-V01-08-page26.txt: [('B', 'ES')]
Sligo19170201-V01-08-page27.txt: [('B', 'ES')]
Sligo19170501-V02-02-page5.txt: [('I', 'lL')]
Sligo19170501-V02-02-page64.txt: [('T', 'rade')]
Sligo19180501-V03-02-page30.txt: [('WILL', 'IAMS')]
Sligo19180501-V03-02-page67.txt: [('inform', 'ation')]
Sligo19180501-V03-02-page76.txt: [('SIM', 'PSON')]
Sligo19181101-V03-04-page33.txt: [('A', 'ND')]
Sligo19181101-V03-04-page4.txt: [('p', 'eace'), ('ra', 'se')]
Sligo19190201-V03-05-page2.txt: [('the', 'TA')]
Sligo19190201-V03-06-page2.txt: [('the', 'TA')]
Sligo19190201-V03-06-page8.txt: [('LA', 'IC')]
Sligo19190201-V04-05-page26.txt: [('con', 'tinually')]
Sligo19190201-V04-05-page33.txt: [('PROM', 'ENADE')]
Sligo19190301-V03-07-page43.txt: [('the', 'TA')]
Sligo19190401-V03-08-page30.txt: [('the', 'TA')]
Sligo19190401-V04-06-page12.txt: [('e', "nemy's")]
Sligo19190401-V04-06-page33.txt: [('PROM', 'ENADE')]
Sligo19190401-V04-07-page33.txt: [('PROM', 'ENADE')]
Sligo19190401-V04-07-page36.txt: [('t', 'ie'), ('a', 'ny')]
Sligo19190401-V04-07-page8.txt: [('leis', 'urely')]
Sligo19191001-V04-01-page17.txt: [('The', 'ES')]
Sligo19191001-V04-01-page28.txt: [('DIAG', 'NOSTIC')]
Sligo19191001-V04-01-page34.txt: [('SIG', 'HT')]
Sligo19191101-V04-02-page1.txt: [('i', 'Bo')]
Sligo19191101-V04-02-page26.txt: [('former', 'ly')]
Sligo19201001-V05-01-page23.txt: [('Broa', 'dview')]
Sligo19201101-V05-02-page16.txt: [('r', 'ENT')]
In [27]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/Sligo/correction6

Average verified rate: 0.9694383383042474

Average of error rates: 0.06953235294117648

Total token count: 290691

In [28]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[28]:
[("'", 918),
 ('d', 700),
 ('m', 670),
 ('w', 667),
 ('e', 389),
 ('n', 291),
 ('r', 252),
 ('f', 244),
 ('g', 234),
 ('th', 177),
 ('t', 114),
 ('co', 99),
 ('k', 46),
 ('schwab', 30),
 ('u', 27),
 ('mt', 23),
 ('mattingly', 22),
 ('pa', 22),
 ('kuppenheimer', 20),
 ('editor-in-chief', 19),
 ("painters'", 19),
 ('va', 19),
 ('ph', 17),
 ('x', 15),
 ('kamoda', 14),
 ('z', 14),
 ('styleplus', 14),
 ('lippart', 14),
 ('dietel', 13),
 ('rebok', 13),
 ('herzog', 13),
 ('flather', 12),
 ('kimonas', 12),
 ('gonian', 11),
 ('friedlander', 11),
 ('ailes', 11),
 ('ahrens', 11),
 ('ei', 11),
 ('chesnutt', 11)]

Review Remaining Errors

In [29]:
reports.docs_with_high_error_rate(summary)
Out[29]:
[('Sligo19190401-V03-08-page36.txt', 1.0),
 ('Sligo19170501-V02-02-page33.txt', 1.0),
 ('Sligo19170501-V02-02-page9.txt', 1.0),
 ('Sligo19170501-V02-02-page11.txt', 1.0),
 ('Sligo19170501-V02-02-page29.txt', 1.0),
 ('Sligo19160501-VXX-XX-page51.txt', 1.0),
 ('Sligo19160501-VXX-XX-page56.txt', 1.0),
 ('Sligo19170501-V02-02-page51.txt', 1.0),
 ('Sligo19190301-V03-07-page27.txt', 1.0),
 ('Sligo19161101-V01-05-page33.txt', 1.0),
 ('Sligo19160501-VXX-XX-page16.txt', 0.889),
 ('Sligo19181101-V03-04-page3.txt', 0.871),
 ('Sligo19170501-V02-02-page6.txt', 0.8),
 ('Sligo19160501-VXX-XX-page14.txt', 0.75),
 ('Sligo19190301-V03-07-page1.txt', 0.714),
 ('Sligo19201101-V05-02-page1.txt', 0.714),
 ('Sligo19160501-VXX-XX-page66.txt', 0.706),
 ('Sligo19160501-VXX-XX-page76.txt', 0.692),
 ('Sligo19160501-VXX-XX-page25.txt', 0.643),
 ('Sligo19160501-VXX-XX-page30.txt', 0.615),
 ('Sligo19190301-V03-07-page51.txt', 0.6),
 ('Sligo19170501-V02-02-page18.txt', 0.6),
 ('Sligo19160501-VXX-XX-page34.txt', 0.591),
 ('Sligo19160501-VXX-XX-page44.txt', 0.556),
 ('Sligo19191001-V04-01-page1.txt', 0.522),
 ('Sligo19160901-V01-03,04-page1.txt', 0.5),
 ('Sligo19170501-V02-02-page21.txt', 0.5),
 ('Sligo19180501-V03-02-page32.txt', 0.5),
 ('Sligo19170501-V02-02-page5.txt', 0.5),
 ('Sligo19160501-VXX-XX-page8.txt', 0.5),
 ('Sligo19160501-VXX-XX-page2.txt', 0.5),
 ('Sligo19190301-V03-07-page4.txt', 0.5),
 ('Sligo19181101-V03-04-page4.txt', 0.43),
 ('Sligo19160501-VXX-XX-page11.txt', 0.412),
 ('Sligo19190401-V04-07-page29.txt', 0.406),
 ('Sligo19180501-V03-02-page56.txt', 0.381),
 ('Sligo19180501-V03-02-page13.txt', 0.375),
 ('Sligo19170501-V02-02-page4.txt', 0.333),
 ('Sligo19181201-V03-05-page1.txt', 0.333),
 ('Sligo19190401-V03-08-page1.txt', 0.333),
 ('Sligo19180501-V03-02-page8.txt', 0.333),
 ('Sligo19160501-VXX-XX-page75.txt', 0.321),
 ('Sligo19190301-V03-07-page16.txt', 0.308),
 ('Sligo19170501-V02-02-page8.txt', 0.302),
 ('Sligo19180501-V03-02-page58.txt', 0.3),
 ('Sligo19160501-VXX-XX-page28.txt', 0.297),
 ('Sligo19160501-VXX-XX-page13.txt', 0.296),
 ('Sligo19190301-V03-07-page23.txt', 0.286),
 ('Sligo19180501-V03-02-page39.txt', 0.286),
 ('Sligo19170501-V02-02-page36.txt', 0.274),
 ('Sligo19160501-VXX-XX-page27.txt', 0.269),
 ('Sligo19170501-V02-02-page27.txt', 0.269),
 ('Sligo19170501-V02-02-page23.txt', 0.269),
 ('Sligo19180501-V03-02-page57.txt', 0.267),
 ('Sligo19160501-VXX-XX-page32.txt', 0.262),
 ('Sligo19180501-V03-02-page6.txt', 0.259),
 ('Sligo19170501-V02-02-page19.txt', 0.259),
 ('Sligo19170501-V02-02-page31.txt', 0.247),
 ('Sligo19170501-V02-02-page10.txt', 0.244),
 ('Sligo19170301-V01-09-page31.txt', 0.239),
 ('Sligo19190201-V04-05-page3.txt', 0.239),
 ('Sligo19180501-V03-02-page36.txt', 0.238),
 ('Sligo19180501-V03-02-page9.txt', 0.224),
 ('Sligo19160501-VXX-XX-page15.txt', 0.214),
 ('Sligo19170201-V01-08-page30.txt', 0.213),
 ('Sligo19160501-VXX-XX-page86.txt', 0.213),
 ('Sligo19170501-V02-02-page32.txt', 0.211),
 ('Sligo19170301-V01-09-page3.txt', 0.207),
 ('Sligo19201001-V05-01-page20.txt', 0.204),
 ('Sligo19170501-V02-02-page7.txt', 0.201)]
In [30]:
# %load shared_elements/high_error_rates.py
doc_keys = [x[0] for x in reports.docs_with_high_error_rate(summary) if x[1] > 0.8]

utilities.open_original_docs(doc_keys, directories['cycle'])
Opened files: 

Sligo19190401-V03-08-page36.txt

Sligo19170501-V02-02-page33.txt

Sligo19170501-V02-02-page9.txt

Sligo19170501-V02-02-page11.txt

Sligo19170501-V02-02-page29.txt

Sligo19160501-VXX-XX-page51.txt

Sligo19160501-VXX-XX-page56.txt

Sligo19170501-V02-02-page51.txt

Sligo19190301-V03-07-page27.txt

Sligo19161101-V01-05-page33.txt

Sligo19160501-VXX-XX-page16.txt

Sligo19181101-V03-04-page3.txt

There are a number of blank pages in the periodical, where are being calculated as having a error rate of 1.

In [31]:
reports.long_errors(errors_summary, min_length=15)
Out[31]:
(["rliaalezt'rottok",
  'accessible--get-at-able',
  'aiiiiiimmoinuiumummininiiiiiiiiiiiiimmiiiiiiiiiiiiimmiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiniiiiiiiiiiiiiiminimmilliniiiiiiiiiiiiiiiiiiiiiiiiiimmiiiiiimmtiiiiiiiiirri',
  'photographicffrints',
  'themauricejoyceengravingc',
  'emiminwimiimiiiiiiimmunimmou',
  'wearenowreadywithourfull',
  'self-satisfaction',
  'themavricejoyceengravingc',
  'wednesday-morning',
  "editor-in-chi'ef",
  'thirty-horse-power',
  'missionary-in-themaking',
  'character-foundation',
  "students'association",
  'tammininnthimminumniiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiimiiimminiiiiiiiiiiiiiiiiiiiiiiiimmuiiimiiiiiiiiliiimmiiiiiiiiiiiiiiiiiiiiiiiiimmiiiiiiiimiiiiiiiiiiiiimmiiiiiiiiiiiira',
  'ninumumimmimmummunimmummumummunommimminiumiimiumumummumimmimumumomminimminummumummulillimmuumnig',
  'commander-in-chief',
  'ccosagzegoatteeztemewssms',
  'miimiiiinimimmiiiimiiimiimmomm',
  'seminary-sanitarium',
  'storm-bewildered',
  'secretarytreasurer',
  'commandmentkeeping',
  'forty-horse-power',
  'fourteenyear-old',
  'mitchell-mccutchen',
  'andsistermaxirno',
  'supplies-hospital',
  'twenty-cent-a-week',
  'exceptionallyhard',
  'exzeitsteasearzega',
  'eimmummimiiiiiiiiiiiiiimiiiimiiiiin',
  'szemesmaximeniwzmilanommescm',
  'self-sacrificirig',
  'winimmimummumiiihmmmunimmimmimminiminimminimimmimmimmiiimmumniunimiiiiiimmiimmermuminiiimmin',
  'superdreadnaught',
  'gerhart-schilling',
  'farmingimplements',
  'six-thousand-dollar',
  'straight-forward',
  'companyschenectady',
  'devil-worshipers',
  'eactepstomentaxermoismiscatemosa',
  'self-forgetfulness',
  'thrill-that-comesonce-in-a-lifetime',
  'secretary-treasurer',
  'rilimiimmummininimummumummummommumummimiummummunnumnimmilmmoiimmimmmmunimmunimminimnummumumme',
  'pattern-designer',
  'thesligonianwhen',
  'discouragernents',
  'sandborn-renninger',
  'woodwardandlothrop',
  'thirst-engendering',
  'iilmammemimmimim',
  'sub-corresponding',
  'stick-to-it-iveness',
  'eight-hundred-word',
  'aiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiimminninsiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiliiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiimiliiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiifi',
  'get-at-able--when',
  'eildimitrottcdkz',
  "ladies'andmen'sgarments",
  'rastontowiemitationt',
  'themaricljoycengningc',
  'schillberg-guild',
  'director-general',
  'up-to-the-minute',
  'washington-maryland',
  'knox-albertsworth',
  'manufacturingresources',
  'aimminmommimummumumummunumitimmiumniumiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiimminiummiiiiiiiiiiiiiiiimmiummiiiimminiiiimmummummimunni',
  'wiiiiimmimmiiiiiiiiiiiiiiiim',
  'dirkmajouttkuvocco',
  'selfaggrandizement',
  'aminanummmammoimommmumamminummmuumaimmumaumumaumumnmummuninniminimmuumammuimummumumnomm',
  'toauricejoyceengravinco',
  'synchronological',
  'conscienceappealing',
  'hungrier-looking',
  'house-furnishings',
  'student-missionary',
  'cemtwenxsmspigataasaistmozmoca',
  'heiledzmvztikdko',
  'sphygmonanometers',
  'september-october',
  'tiinuricejoyceengravingc',
  'nalinamammummumuthimmuniminimmiiiiiiiffillifilamilliffilliaffiallaaaallifillminiiiiiiiiammiiiiiiiiimmiimmlimmammiummommimam',
  'lhidthesligonianel',
  'accessible--get-at-able--when',
  'neeheeneemeiieemeeininienienerninememelliiiieiniiiiiliiiiiiniiniieemenemeeemhieemememo'],
 15)

Correction 7 -- Remove Long Errors with Repeating Characters

In [32]:
# %load shared_elements/remove-tokens-with-long-strings-of-characters.py
prev = "correction6"
cycle = "correction7"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    
    replacements = []
    replacements.append(clean.check_for_repeating_characters(tokens, "m|M"))
    replacements.append(clean.check_for_repeating_characters(tokens, "i|I"))
    
    if len(replacements) > 1:
        replacements = [item for sublist in replacements for item in sublist]
            
    if len(replacements) > 0:
        print('{}: {}'.format(filename, replacements))

        for replacement in replacements:
            content = clean.replace_pair(replacement, content)
    else:
        pass
    
    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
Sligo19160401-V01-01-page7.txt: [('aiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiimminninsiiiiiiiIiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiliiiiiiiiiiiiiiiiiiiiiiiIiiiiiiiiiiiiimiliiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiifi', ' ')]
Sligo19160901-V01-03,04-page7.txt: [('rilimiimmummininimummumummummommumummimiummummunnumnimmilmmoiimmimmmmunimmunimminimnummumumme', ' ')]
Sligo19161101-V01-05-page22.txt: [('MIIMIIIINIMIMMIIIIMIIIMIIMMOMM', ' '), ('EMIMINWIMIIMIIIIIIIMMUNIMMOU', ' '), ('MIIMIIIINIMIMMIIIIMIIIMIIMMOMM', ' '), ('NEEHEENEEMEIIEEMEEininiEniEnErniNEMEMElliiiiEiniiiiiliiiiiiniiniiEEMENEMEEEMHIEEMEMEMO', ' ')]
Sligo19161201-V01-06-page7.txt: [('Ninumumimmimmummunimmummumummunommimminiumiimiumumummumimmimumumomminimminummumummulillimmuumnig', ' '), ('tammininnthimminumniiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiimiiimminiiiiiiiiiiiiiiiiiiiiiiiimmuiiimiiiiiiiiliiimmiiiiiiiiiiiiiiiiiiiiiiiiimmiiiiiiiimiiiiiiiiiiiiimmiiiiiiiiiiiira', ' '), ('tammininnthimminumniiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiimiiimminiiiiiiiiiiiiiiiiiiiiiiiimmuiiimiiiiiiiiliiimmiiiiiiiiiiiiiiiiiiiiiiiiimmiiiiiiiimiiiiiiiiiiiiimmiiiiiiiiiiiira', ' ')]
Sligo19170101-V01-07-page7.txt: [('nalinamammummumuthimmuniminimmiiiiiiiffillifilamilliffilliaffiallaaaallifillminiiiiiiiiammiiiiiiiiimmiimmlimmammiummommimam', ' '), ('aimminmommimummumumummunumitimmiumniumiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiimminiummiiiiiiiiiiiiiiiimmiummiiiimminiiiimmummummimunni', ' '), ('nalinamammummumuthimmuniminimmiiiiiiiffillifilamilliffilliaffiallaaaallifillminiiiiiiiiammiiiiiiiiimmiimmlimmammiummommimam', ' '), ('aimminmommimummumumummunumitimmiumniumiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiimminiummiiiiiiiiiiiiiiiimmiummiiiimminiiiimmummummimunni', ' ')]
Sligo19170201-V01-08-page24.txt: [('aiiiiiimmoinuiumummininiiiiiiiiiiiiimmiiiiiiiiiiiiimmiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiniiiiiiiiiiiiiiminimmilliniiiiiiiiiiiiiiiiiiiiiiiiiimmiiiiiimmtiiiiiiiiirri', ' '), ('aiiiiiimmoinuiumummininiiiiiiiiiiiiimmiiiiiiiiiiiiimmiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiniiiiiiiiiiiiiiminimmilliniiiiiiiiiiiiiiiiiiiiiiiiiimmiiiiiimmtiiiiiiiiirri', ' ')]
Sligo19170201-V01-08-page7.txt: [('aminanummmammoimommmumamminummmuumaimmumaumumaumumnmummuninniminimmuumammuimummumumnomm', ' '), ('EIMMUMMIMIIIIIIIIIIIIIIMIIIIMIIIIIN', ' '), ('WIIIIIMMIMMIIIIIIIIIIIIIIIIM', ' ')]
Sligo19170301-V01-09-page7.txt: [('WINIMMIMUMMUMIIIHMMMUNIMMIMMIMMINIMINIMMINIMIMMIMMIMMIIIMMUMNIUNIMIIIIIIMMIIMMERMUMINIIIMMIN', ' '), ('WINIMMIMUMMUMIIIHMMMUNIMMIMMIMMINIMINIMMINIMIMMIMMIMMIIIMMUMNIUNIMIIIIIIMMIIMMERMUMINIIIMMIN', ' ')]
In [33]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/Sligo/correction7

Average verified rate: 0.9694850297753176

Average of error rates: 0.06949411764705882

Total token count: 290677

In [34]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[34]:
[("'", 918),
 ('d', 700),
 ('m', 670),
 ('w', 667),
 ('e', 389),
 ('n', 291),
 ('r', 252),
 ('f', 244),
 ('g', 234),
 ('th', 177),
 ('t', 114),
 ('co', 99),
 ('k', 46),
 ('schwab', 30),
 ('u', 27),
 ('mt', 23),
 ('mattingly', 22),
 ('pa', 22),
 ('kuppenheimer', 20),
 ('editor-in-chief', 19),
 ("painters'", 19),
 ('va', 19),
 ('ph', 17),
 ('x', 15),
 ('kamoda', 14),
 ('z', 14),
 ('styleplus', 14),
 ('lippart', 14),
 ('dietel', 13),
 ('rebok', 13),
 ('herzog', 13),
 ('flather', 12),
 ('kimonas', 12),
 ('gonian', 11),
 ('friedlander', 11),
 ('ailes', 11),
 ('ahrens', 11),
 ('ei', 11),
 ('chesnutt', 11)]

Correction 8 -- Split Squashed Words

In [35]:
# %load shared_elements/separate_squashed_words.py
import pandas as pd
from math import log

prev = "correction7"
cycle = "correction8"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

verified_tokens = []

for filename in corpus:  
    content = utilities.readfile(directories['prev'], filename)
    clean.get_approved_tokens(content, spelling_dictionary, verified_tokens)

tokens_with_freq = dict(collections.Counter(verified_tokens))
words = pd.DataFrame(list(tokens_with_freq.items()), columns=['token','freq'])
words_sorted = words.sort_values('freq', ascending=False)
words_sorted_short = words_sorted[words_sorted.freq > 2]

sorted_list_of_words = list(words_sorted_short['token'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    text = utilities.strip_punct(content)
    tokens = utilities.tokenize_text(text)
    
    wordcost = dict((k, log((i+1)*log(len(sorted_list_of_words)))) for i,k in enumerate(sorted_list_of_words))
    maxword = max(len(x) for x in sorted_list_of_words)
    
    replacements = []
    
    for token in tokens:
        if not token.lower() in spelling_dictionary:
            if len(token) > 17:
                if re.search(r"[\-\-\'\"]", token):
                    pass
                else:
                    split_string = clean.infer_spaces(token, wordcost, maxword)
                    list_split_string = split_string.split()
                    
                    if clean.verify_split_string(list_split_string, spelling_dictionary):
                        replacements.append((token, split_string))
                    else:
                        pass
            else:
                pass
        else:
            pass
        
    if len(replacements) > 0:
        print("{}: {}".format(filename, replacements))
        
        for replacement in replacements:
            content = clean.replace_pair(replacement, content)
    
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
Sligo19160901-V01-03,04-page5.txt: [('LHIDTHESLIGONIANEl', 'L H I D T H E S L I G O N I A N E l')]
Sligo19181101-V03-04-page27.txt: [('WoodwardandLothrop', 'Woodward and Lothrop')]
Sligo19181201-V03-05-page34.txt: [('WoodwardandLothrop', 'Woodward and Lothrop')]
Sligo19190301-V03-07-page48.txt: [('WoodwardandLothrop', 'Woodward and Lothrop')]
Sligo19190401-V03-08-page3.txt: [('conscienceappealing', 'conscience appealing')]
Sligo19190401-V03-08-page34.txt: [('WoodwardandLothrop', 'Woodward and Lothrop')]
Sligo19191001-V04-01-page2.txt: [('WoodwardandLothrop', 'Woodward and Lothrop')]
Sligo19191001-V04-01-page33.txt: [('WEARENOWREADYWITHOURFULL', 'WE ARE NOW READY WITH OUR FULL')]
Sligo19201001-V05-01-page24.txt: [('secretarytreasurer', 'secretary treasurer')]
Sligo19201201-V05-03-page31.txt: [('THEMAURICEJOYCEENGRAVINGC', 'THE MAURICE JOYCE ENGRAVING C')]
Sligo19201201-V05-03-page36.txt: [('CompanySchenectady', 'Company Schenectady')]
In [36]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/Sligo/correction8

Average verified rate: 0.9695029874413948

Average of error rates: 0.06944509803921568

Total token count: 290717

In [37]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[37]:
[("'", 918),
 ('d', 701),
 ('m', 670),
 ('w', 667),
 ('e', 391),
 ('n', 293),
 ('r', 252),
 ('f', 244),
 ('g', 235),
 ('th', 177),
 ('t', 115),
 ('co', 99),
 ('k', 46),
 ('schwab', 30),
 ('u', 27),
 ('mt', 23),
 ('mattingly', 22),
 ('pa', 22),
 ('kuppenheimer', 20),
 ('editor-in-chief', 19),
 ("painters'", 19),
 ('va', 19),
 ('ph', 17),
 ('x', 15),
 ('kamoda', 14),
 ('z', 14),
 ('styleplus', 14),
 ('lippart', 14),
 ('dietel', 13),
 ('rebok', 13),
 ('herzog', 13),
 ('flather', 12),
 ('kimonas', 12),
 ('gonian', 11),
 ('friedlander', 11),
 ('ailes', 11),
 ('ahrens', 11),
 ('ei', 11),
 ('chesnutt', 11)]
In [ ]: