CE-OCR-Evaluation-and-Correction

In [1]:
%load_ext autoreload
In [2]:
%autoreload 2
In [3]:
from text2topics import reports
from text2topics import utilities
from text2topics import clean
import re
import os
from os import listdir
from os.path import isfile, join
import collections
In [4]:
%matplotlib inline
In [5]:
wordlist_dir = "/Users/jeriwieringa/Dissertation/drafts/data/word-lists"
wordlists = ["2016-12-07-SDA-last-names.txt", 
             "2016-12-07-SDA-place-names.txt", 
             "2016-12-08-SDA-Vocabulary.txt", 
             "2017-01-03-place-names.txt", 
             "2017-02-14-Base-Word-List-SCOWL&KJV.txt",
             "2017-02-14-Roman-Numerals.txt",
             "2017-03-01-Additional-Approved-Words.txt"
            ]
In [6]:
spelling_dictionary = utilities.create_spelling_dictionary(wordlist_dir, wordlists)
In [7]:
title = "CE"
In [8]:
base_dir = "/Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/{}/".format(title)

Baseline

In [9]:
cycle = 'baseline'
In [10]:
stats = reports.overview_report(join(base_dir, cycle), spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/CE/baseline

Average verified rate: 0.9354435814569106

Average of error rates: 0.0796414271047228

Total token count: 1970416

In [11]:
errors_summary = reports.get_errors_summary( stats )
reports.top_errors( errors_summary, 100 )
Out[11]:
[('ñ', 8629),
 ('¥', 5694),
 ('-', 4467),
 (')', 2257),
 ('e', 1546),
 ('(', 1535),
 ("'", 1489),
 ('m', 1419),
 ('tion', 1327),
 ('d', 1294),
 ('in-', 1144),
 ('+', 1089),
 ('re-', 1031),
 ('con-', 1015),
 ('de-', 783),
 ('w', 717),
 ('be-', 691),
 ('x', 659),
 ('r', 626),
 ('ex-', 588),
 ('g', 568),
 ('t', 539),
 ('com-', 539),
 ('ment', 534),
 ('n', 518),
 ('/', 501),
 ('f', 489),
 ('tions', 413),
 ('chil-', 403),
 ('dren', 394),
 ('ers', 382),
 ('en-', 380),
 ('pro-', 367),
 ('=', 364),
 ('*', 360),
 ('stu-', 324),
 ('teach-', 315),
 ('un-', 309),
 ('educa-', 302),
 ('edu-', 299),
 ('pre-', 298),
 ('ac-', 288),
 ('¥¥', 286),
 ('per-', 286),
 ('im-', 284),
 ('ture', 259),
 ('dis-', 257),
 ('++', 253),
 ('ence', 243),
 ('col-', 237),
 ('ad-', 236),
 ('ap-', 233),
 ('al-', 217),
 ('sub-', 202),
 ('an-', 201),
 ('at-', 199),
 ('ful', 197),
 ('ments', 188),
 ('•', 175),
 ('mis-', 174),
 ('tional', 170),
 ('_', 151),
 ('ple', 149),
 ('to-', 148),
 ('lege', 146),
 ('co', 146),
 ('inter-', 145),
 ('ob-', 144),
 ('--', 143),
 ('some-', 142),
 ('u', 140),
 ('chris-', 139),
 ('ber', 139),
 ('di-', 139),
 ('par-', 136),
 ('li', 136),
 ('for-', 135),
 ('train-', 130),
 ('na-', 129),
 ('ent', 127),
 ('les-', 126),
 ('cor-', 126),
 ('fol-', 125),
 ('prac-', 125),
 ('lan-', 125),
 ('(concluded', 124),
 ('as-', 123),
 ('pur-', 121),
 ('%', 120),
 ('ar-', 120),
 ('tian', 120),
 ('under-', 119),
 ('(see', 114),
 ('k', 114),
 ('his-', 113),
 ('prin-', 113),
 ('ques-', 112),
 ('se-', 111),
 ('man-', 110),
 ('gen-', 109),
 ('or-', 107),
 ('work-', 107),
 ('num-', 107),
 ('depart-', 106),
 ('read-', 104),
 ('pu-', 103),
 ('pos-', 103),
 ('sup-', 101),
 ('em-', 101),
 ('suc-', 101)]

Correction 1 -- Special Characters

In [12]:
reports.tokens_with_special_characters(errors_summary)
Out[12]:
[('ñ', 8629),
 ('¥', 5694),
 (')', 2257),
 ('(', 1535),
 ('+', 1089),
 ('/', 501),
 ('=', 364),
 ('*', 360),
 ('¥¥', 286),
 ('++', 253),
 ('•', 175),
 ('_', 151),
 ('(concluded', 124),
 ('%', 120),
 ('(see', 114),
 (']', 98),
 ('¥¥¥', 92),
 ('___', 82),
 ('numbers)', 80),
 ('\\', 80),
 ('(a)', 77),
 ('ó', 74),
 ('—', 73),
 ('(b)', 71),
 ('(fig', 70),
 ('ã', 69),
 ('(poem)', 66),
 ('[', 55),
 ('>', 54),
 ('(the', 51),
 ('`', 51),
 ('¡', 51),
 ('(study', 49),
 ('(poetry)', 43),
 ('¥¥¥¥', 41),
 ('(continued', 38),
 ('**', 37),
 ('(to', 37),
 ('(for', 34),
 ('ô', 33),
 ('(selections)', 33),
 ('(c)', 33),
 ('(native', 33),
 ('(colored)', 31),
 ('(or', 30),
 ('¥=', 30),
 ('[the', 29),
 ('ñthe', 28),
 ('\ufeff', 27),
 ('***', 27),
 ('(in', 27),
 ('(and', 26),
 ('<', 24),
 ('(i', 24),
 ('ers)', 23),
 ('*¥', 23),
 ('<>', 22),
 ('=¥', 22),
 ('m¥', 21),
 ('(with', 20),
 ('i)', 20),
 ('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++', 20),
 ('month)', 19),
 ('years)', 19),
 ('(not', 18),
 ('(a', 18),
 ('pictures)', 18),
 ('#', 18),
 ('(page', 18),
 ('(first', 17),
 ('(one', 17),
 ('(of', 16),
 ('~~', 16),
 ('part)', 16),
 ('(including', 16),
 ('*****', 16),
 ('(four', 16),
 ('¥i', 15),
 ('****', 15),
 ('¥¥¥¥¥', 15),
 ('(i)', 15),
 ('page)', 15),
 ('{', 15),
 ('(two', 14),
 ('♦', 14),
 ('(monthly', 14),
 ('(pages', 13),
 ('(as', 13),
 ('-¥', 13),
 ('(grade', 12),
 ('\\\\', 12),
 ('******', 12),
 ('_-', 12),
 ('work)', 12),
 ('ña', 12),
 ('ñid', 12),
 ('__', 12),
 ('¥m', 12),
 ('teachers)', 12),
 ('¥¥¥¥¥¥', 11),
 ('(affiliated', 11),
 ('-_', 11),
 ('(original', 11),
 ('¥o', 11),
 ('(d)', 11),
 ('versity)', 11),
 ('¥of', 11),
 ('(weekly)', 11),
 ('(shoulder', 10),
 ('(this', 10),
 ('ö', 10),
 ('¥*', 10),
 ('(vaud)', 10),
 ('¥>', 10),
 ('day)', 10),
 ('(contributed', 10),
 ('(at', 10),
 ('f+', 10),
 ('(touch', 10),
 ('ored)', 10),
 ('==', 10),
 ('[in', 10),
 ("'¥", 10),
 ('(col-', 10),
 ('(by', 10),
 ('days)', 10),
 ('(grades', 10),
 ('o)', 9),
 ('(gr', 9),
 ('(sketch', 9),
 ('times)', 9),
 ('____', 9),
 ('••', 9),
 ('*******', 9),
 ('ò', 9),
 ('¥a', 9),
 ('(if', 9),
 ('it)', 9),
 ('(fifth', 9),
 ('+f', 9),
 ('¥-', 9),
 ('(from', 9),
 ('[a', 8),
 ('n)', 8),
 ('do)', 8),
 ('ñmrs', 8),
 ('*o', 8),
 ('(concluded)', 8),
 ('of¥', 8),
 ('poem)', 8),
 ("'ñ", 8),
 ('ñno', 8),
 ('number)', 8),
 ('high)', 8),
 ('one)', 8),
 ('¥t', 8),
 ('■', 7),
 ('standard-%', 7),
 ('¥to', 7),
 ('<><>', 7),
 ('itsñ', 7),
 ('grades)', 7),
 ('kern)', 7),
 ('age)', 7),
 ('(that', 7),
 ('¥the', 7),
 ('(last', 7),
 ('//', 7),
 ('i/', 7),
 ('¥¥¥¥¥¥¥', 7),
 ('/m', 7),
 ('(educational)', 7),
 ('(luke', 7),
 ('*+', 7),
 ('school)', 7),
 ('(e', 7),
 ('(which', 7),
 ('grade)', 7),
 ('¥and', 6),
 ('**********', 6),
 ('ii)', 6),
 ('weeks)', 6),
 ('(show', 6),
 ('cents)', 6),
 ('m=', 6),
 ('¥¥¥¥¥¥¥¥¥', 6),
 ('(though', 6),
 ('the¥', 6),
 ('(matt', 6),
 ('********', 6),
 ('of)', 6),
 ('(up', 6),
 ('(prov', 6),
 ('t¥', 6),
 ('(r)', 6),
 ('(full', 6),
 ('+++', 6),
 ('i¥', 6),
 ('year)', 6),
 ('¨', 6),
 ('ñto', 6),
 ('inch)', 6),
 ('(e)', 5),
 ('^', 5),
 ('•••', 5),
 ("')", 5),
 ('(note', 5),
 ('(step', 5),
 ('¥in', 5),
 ('(music)', 5),
 ('—the', 5),
 ('(each', 5),
 ('ñthis', 5),
 ('¤', 5),
 ('(illustrate', 5),
 ('(on', 5),
 ('a¥', 5),
 ('++++++', 5),
 ('(short', 5),
 ('+¥', 5),
 ('accepted)', 5),
 ('(signed)', 5),
 ('[we', 5),
 ('(draw', 5),
 ('(white', 5),
 ('(ohio)', 5),
 ('(it', 5),
 ('(repeat', 5),
 ('course)', 5),
 ('ñstone-millis', 5),
 ('ñin', 5),
 ('(except', 5),
 ('¥+', 5),
 ('(t', 5),
 ('(song)', 5),
 ('hand)', 5),
 ('(these', 5),
 ('(subscriptions', 5),
 ('schools)', 5),
 ('%-', 5),
 ('¥¥=', 5),
 ('week)', 5),
 ('duplicated)', 5),
 ('=m', 5),
 ('¥n', 5),
 ('e¥', 5),
 ('cr)', 5),
 ('in¥', 5),
 ('(italics', 5),
 ('(john', 5),
 ('to¥', 5),
 ('(third', 4),
 ('more)', 4),
 ('(no', 4),
 ('r¥', 4),
 ('subheads)', 4),
 ('(use', 4),
 ('(about', 4),
 ('(g)', 4),
 ('months)', 4),
 ('¥¥¥¥¥¥¥¥', 4),
 ('(t)', 4),
 ('ñhenry', 4),
 ('r)', 4),
 ('ñed', 4),
 ('%-inch', 4),
 ('to)', 4),
 ('o¥', 4),
 ('(m', 4),
 ('ñreview', 4),
 ('¥¥¥¥¥¥¥¥¥¥¥', 4),
 ('¥our', 4),
 ('(more', 4),
 ('teacher]', 4),
 ('[this', 4),
 ('edition)', 4),
 ('ñ`', 4),
 ('✓', 4),
 ('(so', 4),
 ('c¥', 4),
 ('-%', 4),
 ('v)', 4),
 ('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++', 4),
 ('(keep', 4),
 ('(continued)', 4),
 ('first)', 4),
 ('name)', 4),
 ('()', 4),
 ('(read', 4),
 ('ãã', 4),
 ('teacher)', 4),
 ('(dark', 4),
 ('ñessays', 4),
 ('--¥', 4),
 ('-_-', 4),
 ('(you', 4),
 ('(all', 4),
 ('¥christian', 4),
 ('%-%', 4),
 ('feet)', 4),
 ('matter)', 4),
 ('(let', 4),
 ('(revised)', 4),
 ('/-', 4),
 ('(give', 4),
 ('books)', 4),
 ('c)', 4),
 ('(com-', 4),
 ('±', 4),
 ('(plate', 4),
 ('><>', 4),
 ('+i', 4),
 ('[of', 4),
 ('[from', 4),
 ('++++++++++++++++++++++++++++', 4),
 ('(make', 4),
 ('girls)', 4),
 ('bible)', 4),
 ('(be', 4),
 ('**************', 4),
 ('(second', 4),
 ('•••••', 4),
 ('training=school', 4),
 ('++++', 4),
 ('+++++', 4),
 ('use)', 4),
 ("ñ's", 4),
 ('(standing)', 4),
 ('(unless', 4),
 ('(bend', 3),
 ('light)', 3),
 ('gradeñirene', 3),
 ('---_', 3),
 ('°', 3),
 ('ones)', 3),
 ('\\i', 3),
 ('(strong', 3),
 ('/¥', 3),
 ('iii)', 3),
 ('ñjames', 3),
 ('(review)', 3),
 ('five)', 3),
 ('word)', 3),
 ('+++++++++', 3),
 ('preparation)', 3),
 ('all)', 3),
 ('ããã', 3),
 ('[oblong', 3),
 ('(perhaps', 3),
 ('[and', 3),
 ('paper)', 3),
 ('[all', 3),
 ('¥his', 3),
 ('face]', 3),
 ('***********', 3),
 ('ing)', 3),
 ('¥¥>', 3),
 ('e=', 3),
 ('[at', 3),
 ('college]', 3),
 ('+++++++', 3),
 ('home)', 3),
 ('ñmake', 3),
 ('(say', 3),
 ("¥'", 3),
 ('g)', 3),
 ('(new', 3),
 ('ñread', 3),
 ('gradeñedith', 3),
 ('(printed', 3),
 ('hands)', 3),
 ('(go', 3),
 ('r_', 3),
 ('that¥', 3),
 ('(gain)', 3),
 ('schoolñ', 3),
 ('(air', 3),
 ('others)', 3),
 ('[while', 3),
 ('(lesson', 3),
 ('is¥', 3),
 ('college)', 3),
 ('efficiencyñ', 3),
 ('¥-¥', 3),
 ("\\'", 3),
 ('(twelve', 3),
 ('(vol', 3),
 ('board)', 3),
 ('(weeks)', 3),
 ('(august)', 3),
 ('eight)', 3),
 ('canada)', 3),
 ('theñ', 3),
 ('(>', 3),
 ('(any', 3),
 ('step)', 3),
 ('units)', 3),
 ('(clay', 3),
 ('¥as', 3),
 ('=a', 3),
 ('(f)', 3),
 ('(rev', 3),
 ('(verse', 3),
 ('(doctor', 3),
 ('(figs', 3),
 ('++++++++', 3),
 ('¥is', 3),
 ('ñalice', 3),
 (')-', 3),
 ('(based', 3),
 ('=r', 3),
 ('h+', 3),
 ('ñen', 3),
 ('(but', 3),
 ('p¥', 3),
 ('ñwith', 3),
 ('ñwhen', 3),
 ('clause)', 3),
 ('ñeach', 3),
 ('(iii', 3),
 ('conference)', 3),
 ('(pupils', 3),
 ('cent)', 3),
 ('(year)', 3),
 ('[or', 3),
 ('homeñ', 3),
 ('ñeduca-', 3),
 ('¥mm', 3),
 ('(standard)', 3),
 ('iñ', 3),
 ('(cal', 3),
 ('february)', 3),
 ('-*', 3),
 ('(we', 3),
 ('f¥', 3),
 ('n/', 3),
 ('--_-', 3),
 ('us)', 3),
 ('(also', 3),
 ('are¥', 3),
 ('(week)', 3),
 ('addressñ', 3),
 ('tions)', 3),
 ('(she', 3),
 ('are)', 3),
 ('room)', 3),
 ('_a', 3),
 ('(carbohydrates', 3),
 ('(explain', 3),
 ('(book', 3),
 ('(eight', 3),
 ('<x>', 3),
 ('[may', 3),
 ('¥x', 3),
 ('students)', 3),
 ('(followed', 3),
 ('(there', 3),
 ('four)', 3),
 ('(number', 3),
 ('(part', 3),
 ('¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥', 3),
 ('(acts', 3),
 ('¥ne', 3),
 ('¥e', 3),
 ('---_-', 3),
 ('(used', 3),
 ('[signed]', 3),
 ('ii/', 3),
 ('so¡', 3),
 ('(common', 3),
 ('and¥', 3),
 ('gradeñ', 3),
 ('=e', 3),
 ('(con-', 3),
 ('¥min', 3),
 ('*************************', 2),
 ('/r', 2),
 ('offeredñthat', 2),
 ('facultyñthorough', 2),
 ('mil*', 2),
 ('¥¥¥¥¥¥¥¥¥¥', 2),
 ('(mediterranean)', 2),
 ('`¥', 2),
 ('(he', 2),
 ('(commonly', 2),
 ('ti)', 2),
 ('(english', 2),
 ('ñe', 2),
 ('(how', 2),
 ('department)', 2),
 ('who¥', 2),
 ('[showing', 2),
 ('=_', 2),
 ('ii¡', 2),
 ('*aa', 2),
 ('(bible', 2),
 ('(*', 2),
 ('languageñ', 2),
 ('ñwhat', 2),
 ('*as', 2),
 ('go)', 2),
 ('i*', 2),
 ('three)', 2),
 ('<><><>', 2),
 ('plant)', 2),
 ('out)', 2),
 ('it¥', 2),
 ('+++++++++++', 2),
 ('nature)', 2),
 ('vol_', 2),
 ('_/', 2),
 ('(have', 2),
 ('pupils)', 2),
 ('z¥', 2),
 ('+-', 2),
 ('(intermediate', 2),
 ('_____', 2),
 ('ours)', 2),
 ('¥we', 2),
 ('(ii', 2),
 ('rã', 2),
 ('pasteñ', 2),
 ('india)', 2),
 ('(forthcoming)', 2),
 ('world)', 2),
 ('row)', 2),
 ('(st', 2),
 ('¥these', 2),
 ('¥wo', 2),
 ('/i', 2),
 ('(roman', 2),
 ('(children', 2),
 ('(in-', 2),
 ('¥it', 2),
 ('(year', 2),
 ('ñone-teacher', 2),
 ('(causes', 2),
 ('(florence', 2),
 ('n¥', 2),
 ('(ex-', 2),
 ('-=', 2),
 ('latin)', 2),
 ('educator)', 2),
 ('(oregon)', 2),
 ('(puts', 2),
 ('¥conference', 2),
 ('¥an', 2),
 ('play)', 2),
 ('dayñ', 2),
 ('ñat', 2),
 ('ff-_-', 2),
 ('a_', 2),
 (')i', 2),
 ('i_', 2),
 ('she)', 2),
 ('(advice', 2),
 ('drills)', 2),
 ('****************************', 2),
 ('¥¥¥¥¥¥¥=', 2),
 ('ñwilliam', 2),
 ('(dwarf)', 2),
 ('ñpresident', 2),
 ('face)', 2),
 ('(d', 2),
 ('(tell', 2),
 ('p)', 2),
 ('ñwho', 2),
 ('(never', 2),
 ('(such', 2),
 ('press)', 2),
 ('e%', 2),
 ('ñ-', 2),
 ('volumes)', 2),
 ('bad)', 2),
 ('(monthly-', 2),
 ('e)', 2),
 ('ne<', 2),
 ('ours]', 2),
 ('(script)', 2),
 ('does)', 2),
 ('(re-', 2),
 ('[italics', 2),
 ('/=', 2),
 ('(where', 2),
 ('(feeling', 2),
 ('(cocoa', 2),
 ('(an', 2),
 ('for¥', 2),
 ('******************', 2),
 ('ñ¥', 2),
 ('}', 2),
 ('(primary)', 2),
 ('mother)', 2),
 ('(superintendent', 2),
 ('ñbegin', 2),
 ('(born', 2),
 ('[note', 2),
 ('at*', 2),
 ('war)', 2),
 ('¥if', 2),
 ('¥like', 2),
 ('rule)', 2),
 ('(giving', 2),
 ('frontispieceñ', 2),
 ('¥are', 2),
 ('wound)', 2),
 ('(they', 2),
 ('(other', 2),
 ('#*', 2),
 ('ñaside', 2),
 ('bed)', 2),
 ('states)', 2),
 ('¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥', 2),
 ('called)', 2),
 ('mothers)', 2),
 ('tree)', 2),
 ('(seventh', 2),
 ('and_', 2),
 ('¥for', 2),
 ('--=-', 2),
 ('[teacher', 2),
 ('_t', 2),
 ('¥there', 2),
 ('¥j', 2),
 ('union)', 2),
 ('<a', 2),
 ('---¥', 2),
 ('kind)', 2),
 ('[primary', 2),
 ('illustrations)', 2),
 ('the_', 2),
 ('foot)', 2),
 ('v/', 2),
 ('(isa', 2),
 ('(complete', 2),
 ('(september', 2),
 ('‘', 2),
 ('(about)', 2),
 ('effortñ', 2),
 ('-¥-', 2),
 ('catholic)', 2),
 ('china)', 2),
 ('(teach', 2),
 ('number]', 2),
 ('(secretary', 2),
 ('(august', 2),
 ('ã_', 2),
 ('(paper', 2),
 ('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++', 2),
 ('training)', 2),
 ('courses)', 2),
 ('*********', 2),
 ('(preferably', 2),
 ('=---', 2),
 ('[here', 2),
 ('(tracing', 2),
 ('sewingñno', 2),
 ('workñ', 2),
 ('(represented', 2),
 (')ó', 2),
 ('butñ', 2),
 ('effect)', 2),
 ('threeñ', 2),
 ('[previous', 2),
 ('=mo', 2),
 ('only)', 2),
 ('m•r', 2),
 ('testimonies)', 2),
 ('(tenn', 2),
 ('field)', 2),
 ('¥mp', 2),
 ('++++++++++++++++++++++++++++++++++++++', 2),
 ('solution)', 2),
 ('(although', 2),
 ('++++++++++++++++++++++++++++++++++++++++++++++++++', 2),
 ('¥nip', 2),
 ('(r', 2),
 ('agriculture)', 2),
 ('reader)', 2),
 ('storiesñ', 2),
 ('greens)', 2),
 ('(equal', 2),
 ('(physical', 2),
 ('ñyou', 2),
 ('so¥', 2),
 ('(why', 2),
 ('s¥', 2),
 ('¥-¥¥¥¥¥-¥', 2),
 ('(maine)', 2),
 ('observations)', 2),
 ('modeling)', 2),
 ('\\-\\', 2),
 ('n+', 2),
 ('ñraise', 2),
 ('/cu', 2),
 ('style)', 2),
 ('unity)', 2),
 ('ñtake', 2),
 ('questions)', 2),
 ('\\v', 2),
 ('(boy', 2),
 ('[not', 2),
 ('[some', 2),
 ('(over)', 2),
 ('a\\', 2),
 ('ñfor', 2),
 ('¥i¥', 2),
 ('(answers', 2),
 ('¥marks', 2),
 ('before)', 2),
 ('=w', 2),
 ('(h)', 2),
 ('semester)', 2),
 ('(follow', 2),
 ('(mark', 2),
 ('boiled)', 2),
 ('t)', 2),
 ('many)', 2),
 ('(served', 2),
 ('x=', 2),
 ('(fifteen', 2),
 ('+++++++++++++++++', 2),
 ('ñis', 2),
 ('(par', 2),
 ('(resume', 2),
 (')(', 2),
 ('/\\', 2),
 ('(explain)', 2),
 ('(teacher', 2),
 ('inches)', 2),
 ('pattern)', 2),
 ('(dry', 2),
 ('(heathen)', 2),
 ('(rom', 2),
 ('-=-', 2),
 ('types)', 2),
 ('(n', 2),
 ('pupil)', 2),
 ('right)', 2),
 ('\\ve', 2),
 ('><', 2),
 ('••••••', 2),
 ('--=', 2),
 ('(liverpool)', 2),
 ('(three', 2),
 ('our¥', 2),
 ('pine)', 2),
 ('with¥', 2),
 ('f)', 2),
 ('///', 2),
 ('-ñ', 2),
 ('[with', 2),
 ('ñr', 2),
 ('¥¥¥¥¥¥¥¥¥¥¥¥', 2),
 ('¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥', 2),
 ('measure)', 2),
 ('(academy)', 2),
 ('[during', 2),
 ('ñi', 2),
 ('academy)', 2),
 ('_i', 2),
 ('trainingñno', 2),
 ('m•', 2),
 ('ñ+', 2),
 ('\\umber', 2),
 ('(ro)', 2),
 ('(price', 2),
 ('organ)', 2),
 ('addressingñ', 2),
 ('+++++++++++++++++++++++++++++++++++++', 2),
 ('ñcan', 2),
 ('washington)', 2),
 ('(fourth', 2),
 ('(just', 2),
 ('symposium)', 2),
 ('ñdo', 2),
 ('(omitting', 2),
 ('minute)', 2),
 ('(memory', 2),
 ('(card', 2),
 ('i+', 2),
 ('unit)', 2),
 ('level)', 2),
 ('grasses)', 2),
 ('+*', 2),
 ('say)', 2),
 ('¥ã', 2),
 ('language)', 2),
 ('possible)', 2),
 ('class)', 2),
 ('[will', 2),
 ('¥mo', 2),
 ('[under', 2),
 ('pupils]', 2),
 ("('", 2),
 ('(what', 2),
 ('-)', 2),
 ('ñare', 2),
 ('`i', 2),
 ('o¡', 2),
 ('(write', 2),
 ('term)', 2),
 ('μ', 2),
 ('ñhe', 2),
 ('tongue)', 2),
 ('¥¥¥¥¥=', 2),
 ('ññ', 2),
 ('(k)', 2),
 ('*the', 2),
 ('(luring', 2),
 ('++++++++++++++', 2),
 ('(reproduction', 2),
 ('„', 2),
 ('(yale)', 2),
 ('+)', 2),
 ('(hold', 2),
 ("(snellen's", 2),
 ('(shakes', 2),
 ('%e', 2),
 ('defects)', 2),
 ('===', 2),
 ('(isaiah', 2),
 ('•=', 2),
 ('this)', 2),
 ('(mo', 2),
 ('*-', 2),
 ('[t]', 2),
 ('(primary', 2),
 ('(cf', 2),
 ('¥that', 2),
 ('(adapted', 2),
 ('•••••••', 2),
 ('[let', 2),
 ('efficiencyñin', 2),
 ('ñparable', 2),
 ('****************', 2),
 ("(a')", 2),
 ('is)', 2),
 ('continued)', 2),
 ('(picture', 2),
 ('geography)', 1),
 ('leys)', 1),
 ('¥*ô', 1),
 ('¥¥¥¥¥¥¥+¥¥¥¥¥¥¥¥¥¥¥¥¥¥', 1),
 ('¡f', 1),
 ('cap)', 1),
 ('hour)', 1),
 ('(suggestions', 1),
 ('¥***', 1),
 (')b', 1),
 ('*gel', 1),
 ('(october', 1),
 ('(ourse', 1),
 ('calm)', 1),
 ('met¥', 1),
 ('to%', 1),
 ('*hat', 1),
 ('(z)', 1),
 ('e¥¥', 1),
 ('ñnatural', 1),
 ('homeña', 1),
 ('rificww*vin', 1),
 ('(so-called)', 1),
 ('(mental', 1),
 ('explain)', 1),
 ('(around', 1),
 ('gymnastics)', 1),
 ('[geology', 1),
 ('(n-', 1),
 ('trainingñ', 1),
 ('**************************', 1),
 ('ttñit', 1),
 ('eral]', 1),
 ('¥make', 1),
 ("(jairus's", 1),
 ('•m', 1),
 ('(jan', 1),
 ('may)', 1),
 ('%r', 1),
 ('(ii-', 1),
 ('kr¥¥¥', 1),
 ('-i_-', 1),
 ('a+', 1),
 ('requires)', 1),
 ('**************************¥', 1),
 ('¥them', 1),
 ('ñfaerie', 1),
 ('i()', 1),
 ('al/g', 1),
 ('excitedly)', 1),
 ('>***', 1),
 ('¥<><>', 1),
 ('hand¥', 1),
 ('¥¥¥¨¨', 1),
 ('•••s', 1),
 (')grade', 1),
 ('tennysonñin', 1),
 ('flood)', 1),
 ('writing)', 1),
 ('roomsçasamt', 1),
 ('n¡`', 1),
 ('tionalist)', 1),
 ('feed*)', 1),
 ('[then', 1),
 ('unityñone', 1),
 ('(providing', 1),
 ('down)', 1),
 ('j¥te¥', 1),
 ('(reserve', 1),
 ('question=', 1),
 ('ii¥i', 1),
 ('vermont]', 1),
 ('`+', 1),
 ('attl)e', 1),
 ('astray)', 1),
 ('necessa/ry', 1),
 ('(illustrations', 1),
 ('jun/or', 1),
 ('¥¥+¥¥*a', 1),
 ('managers)', 1),
 ('(advancing', 1),
 ('(shoul-', 1),
 ('})', 1),
 ('horse)', 1),
 ('>wcvm', 1),
 (')**', 1),
 ('+(', 1),
 ('-m=', 1),
 ('t++++++++++++++++++++++++++++++++++++++++++++', 1),
 ('did)', 1),
 ('re_', 1),
 ('*>¥¥¥¥', 1),
 ('-r¥-', 1),
 ('e♦+', 1),
 ('bread-and¥butter', 1),
 ('(different', 1),
 ('render¥', 1),
 ('_lane(poem', 1),
 ('ór', 1),
 ('ñselected', 1),
 ('tin)', 1),
 ('dinner)', 1),
 ('_up', 1),
 ('heads)', 1),
 ('¥m`', 1),
 ('(both', 1),
 ('_pa', 1),
 ('¥vii', 1),
 ('xt¥', 1),
 ('twos)', 1),
 ('ñamerican', 1),
 ('nôro', 1),
 ('in_', 1),
 ('¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥', 1),
 ('(twice', 1),
 ('¥uate', 1),
 ('drates)', 1),
 ('gcypc¤ad', 1),
 ('******¥', 1),
 ('ñcut', 1),
 ('iii[)', 1),
 ('them)', 1),
 ('anut/', 1),
 ("people's)", 1),
 ('(siee', 1),
 ('america]', 1),
 ('behindñthat', 1),
 ('¥¥¥¥¥¥¥¥¥¥¥¥¥#', 1),
 ('noon)', 1),
 ('(congrega-', 1),
 ('•••••t', 1),
 ('[w', 1),
 ('(redbird)', 1),
 ('¥sections', 1),
 ('m/o', 1),
 ('is_the', 1),
 ('ever)', 1),
 ('pk)', 1),
 ('h)', 1),
 ('\\vrite', 1),
 ('---_--', 1),
 ('¥//', 1),
 ('mmummnminnummenamilminantimmammumonmitimmonmonolimmumnimme*', 1),
 ('(july', 1),
 ('r¥oxinuomiumn', 1),
 ('(malay)', 1),
 ('><ny', 1),
 ('mart)', 1),
 ('innoci]\\', 1),
 ('two¥thirds', 1),
 ('(agriculture', 1),
 ('ap_o--do', 1),
 ('(stand', 1),
 ('^/', 1),
 ('++++++++++++++++++', 1),
 ('[he]', 1),
 ('yo/', 1),
 ('(matthew', 1),
 ('j*nw', 1),
 ('yñ', 1),
 ...]

No obvious foreign language use.

Correction 1 -- Normalize Characters

In [13]:
# %load shared_elements/normalize_characters.py
prev = "baseline"
cycle = "correction1"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    # Substitute for all other dashes
    content = re.sub(r"—-—–‑", r"-", content)

    # Substitute formatted apostrophe
    content = re.sub(r"\’\’\‘\'\‛\´", r"'", content)
    
    # Replace all special characters with a space (as these tend to occur at the end of lines)
    content = re.sub(r"[^a-zA-Z0-9\s,.!?$:;\-&\'\"]", r" ", content)
    
    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
In [14]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/CE/correction1

Average verified rate: 0.9492114539661417

Average of error rates: 0.06023742299794661

Total token count: 1947762

In [15]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[15]:
[('-', 4674),
 ('e', 1613),
 ("'", 1550),
 ('m', 1518),
 ('tion', 1328),
 ('d', 1315),
 ('in-', 1147),
 ('re-', 1033),
 ('con-', 1018),
 ('de-', 783),
 ('w', 737),
 ('r', 697),
 ('be-', 692),
 ('x', 681),
 ('t', 609),
 ('ex-', 590),
 ('g', 589),
 ('n', 565),
 ('com-', 543),
 ('ment', 535),
 ('f', 532),
 ('tions', 416),
 ('ers', 406),
 ('chil-', 403),
 ('dren', 394),
 ('en-', 380),
 ('pro-', 367),
 ('stu-', 324),
 ('teach-', 315),
 ('un-', 310),
 ('educa-', 306),
 ('edu-', 299),
 ('pre-', 299),
 ('ac-', 288),
 ('per-', 286),
 ('im-', 285),
 ('ture', 259),
 ('dis-', 257),
 ('col-', 247),
 ('ence', 243),
 ('ad-', 236),
 ('ap-', 234),
 ('al-', 217),
 ('sub-', 202),
 ('an-', 201),
 ('at-', 199),
 ('ful', 197),
 ('ments', 188),
 ('--', 181),
 ('mis-', 174)]

Correction 2 -- Correct line-endings

In [16]:
# %load shared_elements/correct_line_endings.py
prev = cycle
cycle = "correction2"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    content = re.sub(r"(\w+)(\-\s{1,})([a-z]+)", r"\1\3", content)

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
In [17]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/CE/correction2

Average verified rate: 0.9807650865741511

Average of error rates: 0.032333675564681724

Total token count: 1907209

In [18]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[18]:
[('-', 4650),
 ('e', 1611),
 ("'", 1550),
 ('m', 1512),
 ('d', 1312),
 ('w', 735),
 ('r', 694),
 ('x', 680),
 ('t', 604),
 ('g', 588),
 ('n', 561),
 ('f', 528),
 ('--', 181),
 ('co', 147),
 ('u', 146),
 ('li', 140),
 ('k', 128),
 ('z', 80),
 ('---', 78),
 ('id', 72),
 ('parent-teacher', 62),
 ('pp', 61),
 ('io', 60),
 ('danish-norwegian', 58),
 ('ni', 55),
 ('oo', 55),
 ('-a', 54),
 ('-inch', 53),
 ('half-year', 49),
 ('high-school', 49),
 ('mo', 47),
 ("'tis", 45),
 ('mt', 44),
 ("an'", 43),
 ('tion', 43),
 ('ex', 43),
 ('th', 41),
 ('church-schools', 41),
 ('mm', 40),
 ('prayer-life', 39),
 ('ti', 39),
 ('il', 36),
 ('tle', 35),
 ('soul-winning', 35),
 ('eighth-grade', 35),
 ("hours'", 34),
 ('full-term', 34),
 ("'s", 34),
 ('ly', 33),
 ('ri', 33)]

Correction 3 -- Address extra dashes

In [19]:
# %load shared_elements/remove_extra_dashes.py
prev = cycle
cycle = "correction3"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)
    
    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    
    replacements = []
    for token in tokens:
        if token[0] is "-":
            replacements.append((token, token[1:]))
            
        elif token[-1] is "-":
            replacements.append((token, token[:-1]))
        else:
            pass
        
    if len(replacements) > 0:
#         print("{}: {}".format(filename, replacements))
        
        for replacement in replacements:
            content = clean.replace_pair(replacement, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
In [20]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/CE/correction3

Average verified rate: 0.984465544377934

Average of error rates: 0.026436601642710476

Total token count: 1904798

In [21]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[21]:
[('e', 1640),
 ("'", 1577),
 ('m', 1540),
 ('d', 1315),
 ('w', 741),
 ('r', 722),
 ('x', 683),
 ('t', 624),
 ('g', 592),
 ('n', 579),
 ('f', 551),
 ('co', 173),
 ('u', 148),
 ('li', 140),
 ('k', 131),
 ('z', 84),
 ('id', 73),
 ('io', 62),
 ('pp', 61),
 ('ni', 56),
 ('oo', 55),
 ('-', 52),
 ('ex', 52),
 ('re', 50),
 ('mo', 49),
 ("'tis", 45),
 ('mt', 44),
 ('tion', 44),
 ('ti', 43),
 ("an'", 43),
 ('th', 42),
 ('mm', 40),
 ('high-school', 40),
 ('prayer-life', 39),
 ('parent-teacher', 37),
 ('il', 37),
 ('tle', 37),
 ('al', 36),
 ('ly', 35),
 ('ri', 35),
 ("'s", 34),
 ("hours'", 34),
 ('ne', 34),
 ('q', 32),
 ('soul-winning', 32),
 ('em', 30),
 ('sq', 28),
 ('ev', 28),
 ('--', 27),
 ('danish-norwegian', 26)]

Correction 4 -- Remove extra quotation marks

In [22]:
# %load shared_elements/remove_extra_quotation_marks.py
prev = cycle
cycle = "correction4"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)
    
    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    
    corrections = []
    for token in tokens:
        token_list = list(token)
        last_char = token_list[-1]

        if last_char is "'":
            if len(token) > 1:
                if token_list[-2] is 's' or 'S':
                    pass
                else:
                    corrections.append((token, re.sub(r"'", r"", token)))
            else:
                pass
        elif token[0] is "'":
            corrections.append((token, re.sub(r"'", r"", token)))   
        else:
            pass
    
    if len(corrections) > 0:
#         print('{}: {}'.format(filename, corrections))

        for correction in corrections:
            content = clean.replace_pair(correction, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
In [23]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/CE/correction4

Average verified rate: 0.9847679386475626

Average of error rates: 0.025965605749486656

Total token count: 1904798

In [24]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[24]:
[('e', 1643),
 ('m', 1540),
 ("'", 1534),
 ('d', 1315),
 ('w', 743),
 ('r', 723),
 ('x', 684),
 ('t', 628),
 ('g', 593),
 ('n', 582),
 ('f', 552),
 ('co', 173),
 ('u', 148),
 ('li', 140),
 ('k', 132),
 ('z', 84),
 ('id', 73),
 ('io', 62),
 ('pp', 61),
 ('ni', 56),
 ('oo', 55),
 ('ex', 53),
 ('-', 52),
 ('re', 50),
 ('mo', 49),
 ('ry', 45),
 ('mt', 44),
 ('tion', 44),
 ('ti', 43),
 ("an'", 43),
 ('th', 42),
 ('mm', 40),
 ('high-school', 40),
 ('prayer-life', 39),
 ('parent-teacher', 37),
 ('il', 37),
 ('tle', 37),
 ('al', 36),
 ('ly', 35),
 ('ri', 35),
 ("hours'", 34),
 ('ne', 34),
 ('em', 34),
 ('q', 32),
 ('soul-winning', 32),
 ('sq', 28),
 ('ev', 28),
 ('--', 27),
 ('danish-norwegian', 26),
 ('tne', 26)]

Correction 5 -- Rejoin Burst Words

In [25]:
# %load shared_elements/rejoin_burst_words.py
prev = cycle
cycle = "correction5"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    pattern = re.compile("(\s(\w{1,2}\s){5,})")
    
    replacements = []
    clean.check_splits(pattern, spelling_dictionary, content, replacements)
    
    if len(replacements) > 0:
#         print('{}: {}'.format(filename, replacements))

        for replacement in replacements:
            content = clean.replace_pair(replacement, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
In [26]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/CE/correction5

Average verified rate: 0.984769974642874

Average of error rates: 0.025962782340862425

Total token count: 1904790

In [27]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[27]:
[('e', 1642),
 ('m', 1540),
 ("'", 1534),
 ('d', 1313),
 ('w', 743),
 ('r', 720),
 ('x', 684),
 ('t', 625),
 ('g', 592),
 ('n', 581),
 ('f', 551),
 ('co', 173),
 ('u', 147),
 ('li', 140),
 ('k', 132),
 ('z', 84),
 ('id', 73),
 ('io', 62),
 ('pp', 61),
 ('ni', 56),
 ('oo', 55),
 ('ex', 53),
 ('-', 52),
 ('re', 50),
 ('mo', 49),
 ('ry', 45),
 ('mt', 44),
 ('tion', 44),
 ('ti', 43),
 ("an'", 43),
 ('th', 42),
 ('mm', 40),
 ('high-school', 40),
 ('prayer-life', 39),
 ('parent-teacher', 37),
 ('il', 37),
 ('tle', 37),
 ('al', 36),
 ('ly', 35),
 ('ri', 35),
 ("hours'", 34),
 ('ne', 34),
 ('em', 34),
 ('q', 32),
 ('soul-winning', 32),
 ('sq', 28),
 ('ev', 28),
 ('--', 27),
 ('danish-norwegian', 26),
 ('tne', 26)]

Correction 6 -- Rejoin Split Words

In [28]:
# %load shared_elements/rejoin_split_words.py
prev = cycle
cycle = "correction6"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)
    
    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    errors = reports.identify_errors(tokens, spelling_dictionary)

    replacements = clean.check_if_stem(errors, spelling_dictionary, tokens, get_prior=False)
    
    if len(replacements) > 0:
#         print('{}: {}'.format(filename, replacements))

        for replacement in replacements:
            content = clean.replace_split_words(replacement, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
In [29]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/CE/correction6

Average verified rate: 0.985078098826811

Average of error rates: 0.02556827515400411

Total token count: 1904382

In [30]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[30]:
[('e', 1627),
 ("'", 1534),
 ('m', 1532),
 ('d', 1311),
 ('w', 742),
 ('r', 717),
 ('x', 682),
 ('t', 620),
 ('g', 589),
 ('n', 575),
 ('f', 551),
 ('u', 147),
 ('co', 142),
 ('k', 132),
 ('li', 122),
 ('z', 84),
 ('id', 73),
 ('pp', 61),
 ('io', 61),
 ('oo', 55),
 ('ni', 54),
 ('-', 52),
 ('ex', 52),
 ('mo', 44),
 ('mt', 44),
 ("an'", 43),
 ('mm', 42),
 ('high-school', 40),
 ('prayer-life', 39),
 ('ry', 39),
 ('th', 38),
 ('ti', 37),
 ('parent-teacher', 37),
 ('il', 37),
 ('tle', 36),
 ('re', 36),
 ("hours'", 34),
 ('tion', 34),
 ('ne', 33),
 ('ri', 33),
 ('em', 33),
 ('soul-winning', 32),
 ('q', 32),
 ('al', 28),
 ('sq', 28),
 ('--', 27),
 ('danish-norwegian', 26),
 ('ginn', 26),
 ('tne', 26),
 ('pa', 25)]

Correction 7 -- Rejoin Split Words II

In [31]:
# %load shared_elements/rejoin_split_words.py
prev = cycle
cycle = "correction7"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)
    
    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    errors = reports.identify_errors(tokens, spelling_dictionary)

    replacements = clean.check_if_stem(errors, spelling_dictionary, tokens, get_prior=True)
    
    if len(replacements) > 0:
#         print('{}: {}'.format(filename, replacements))

        for replacement in replacements:
            content = clean.replace_split_words(replacement, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
In [32]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/CE/correction7

Average verified rate: 0.9852580420347998

Average of error rates: 0.02533521560574949

Total token count: 1904089

In [33]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[33]:
[('e', 1617),
 ("'", 1534),
 ('m', 1507),
 ('d', 1311),
 ('w', 734),
 ('r', 710),
 ('x', 682),
 ('t', 619),
 ('g', 589),
 ('n', 573),
 ('f', 547),
 ('u', 146),
 ('co', 141),
 ('k', 131),
 ('li', 115),
 ('z', 84),
 ('id', 73),
 ('io', 61),
 ('pp', 61),
 ('ni', 54),
 ('oo', 54),
 ('-', 52),
 ('ex', 52),
 ('mt', 44),
 ('mo', 43),
 ("an'", 43),
 ('mm', 42),
 ('high-school', 40),
 ('prayer-life', 39),
 ('th', 38),
 ('parent-teacher', 37),
 ('il', 37),
 ('ti', 36),
 ("hours'", 34),
 ('tion', 34),
 ('ne', 33),
 ('soul-winning', 32),
 ('q', 32),
 ('ri', 32),
 ('em', 32),
 ('re', 31),
 ('sq', 28),
 ('--', 27),
 ('al', 26),
 ('ry', 26),
 ('danish-norwegian', 26),
 ('ginn', 26),
 ('tne', 26),
 ('ft', 25),
 ('ph', 25)]

Review Remaining Errors

In [34]:
reports.long_errors(errors_summary, min_length=15)
Out[34]:
(['self-introductions',
  'teacher-training',
  'monommemommononommom',
  'large-mindedness',
  'consciencestricken',
  'civic-betterment',
  'foreign-language-speaking',
  'pinmoininiiiiiimmirmiiiimi',
  'college-prepared',
  'physician-system',
  'publishing-house',
  'thought-provoking',
  'viamioimmamvokimmumirmviagat',
  'ipwwwwwwwwwwwmoimmu',
  'uellimaiiiiimipamotogintnru',
  'gilssmoramsmwmmwmeorimmmmtim',
  'nwilmelmninnmnrrimmmmeffimmmmmmmmmon',
  'erezzigggeerrzwzreerizerreereeri',
  'mmmmmmwmmmmmmmmmmiswm',
  'katy-did-she-didshe-did',
  'ovimmiimitilliirymirtnin',
  'self-gratification',
  'cock-cock-cock-cock-a-doodledoo',
  'mmmmmmmimmsmnmnimmmmmm',
  'iiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiii',
  'mmhimmmmmmmmmmmnismmmmmmmmmmmmm',
  'peculiar-looking',
  'anti-tuberculosis',
  'mmkimmmessimkimm',
  'vocational-guidance',
  'iimmiiiiiimiiiiiiiiiiiiiiiiiiiffillii',
  'ownoommiwwwiwmlowni',
  'after-conferences',
  'student-teachers',
  'better-disciplined',
  'forty-five-dollar-a-month',
  'pleasant-sounding',
  'progressiveagitation',
  'self-preservation',
  'overorganization',
  'well-illustrated',
  'thirteen-year-old',
  'pilwraillarimipimmillit',
  'fifty-cent-a-week',
  "bachelor's-button",
  'self-explanatory',
  'straightening-up',
  'half-backstitching',
  'stick-to-it-iveness',
  'miesumlimiimuime',
  'asolassammenamnreonm',
  'unvolearlialmusysit',
  'descripconcreteness',
  'mmencemiertmmimimiiviii',
  'college-educated',
  'rameeeeeewiwieneper',
  'iiiiiiiiiiiiiiiii',
  'oneramoosormarianneamagranatiorma',
  'niptionctemioncnimo',
  'straight-grained',
  'multitudinous-eddying',
  'manummeesnormomp',
  'sherwin-williams',
  'five-and-ten-cent',
  'conservationisms',
  'california-nevada',
  'iiminememinemmemmennummenimemenienehmennummerinememen',
  'pandemonium-concert',
  'cmiiimppimpuomii',
  'okiwimmininanomfs',
  'voloommamomeammoommonwrart',
  'foundation-stone',
  'mmurnueuxummututomonminumminmamomoomemmonnuant',
  'quiver-quavering',
  'francis-rockwell',
  'lurloomminnommoirrimmoommonmonmon',
  'inoninnownnepioxim',
  'idwwwmiumwrimmmowiihmmnimmoirwmweivmmium',
  'quarter-million-dollar',
  'mmummnminnummenamilminantimmammumonmitimmonmonolimmumnimme',
  'opening-exercise',
  'memiemeimiemememamiemememessimmemem',
  'vmmummoomissmishotnonnar',
  'self-sufficiency',
  'self-perpetuation',
  'self-destruction',
  'broad-mindedness',
  'ermerarmasurtrararculmnnw',
  'meseraltionsommeminernrnmmearnar',
  'figuresinlargetype',
  'bread-and-butter',
  'supersensitiveness',
  'cause-and-effect',
  'migartminsauxtmetsuitemssi',
  'iiimiiiimiiiiiiii',
  'self-propagating',
  'instructor-primary',
  'under-missionary',
  'denominationally',
  'two-hundred-and-sixty-four-acre',
  'rimmmuummiummemmiesiimiimimiiimiimumilimitioima',
  'mmommozmommmozmmmommmommimommozmom',
  'howahhimehhahhorh',
  'thought-progression',
  'market-gardening',
  'idllilliiidididi',
  'neeeereerrezeeee',
  'great-grandfather',
  'student-canvasser',
  'emergencyconvellical',
  'flesh-distorting',
  'buttonhole-stitch',
  'plommenntmoicnilliiiiii',
  'different-colored',
  'alovosafirminlagain',
  "bachelor'sbutton",
  'non-english-speaking',
  'health-promoting',
  'agtagggtzgtetorg',
  'nsistakimmemestammmkimmmmm',
  'over-ornamentation',
  'enthusiastierily',
  'insiommiminnuommicciiiim',
  'heart-searchings',
  'stiesisonsimasinsissee',
  'tigris-euphrates',
  'tfeeeeeeeeeeeeeeeeeeeeeeeeeeeekt',
  'enommiummummimmmummunimmiummimmminiiiimmummiumummilimomffimummilmommuniimmimmoimmillimmimmmilms',
  'envoy-extraordinary',
  'language-teaching',
  'self-abandonment',
  'otenunmenunonimmonminalumanuannnormanwnontaananonongnmnm',
  'fourteenyear-old',
  'underphraseology',
  'destroyedworship',
  'eeeeeeewaeeeeeeeenetaratasart',
  'iiiiiillllllllll',
  'twentieth-century',
  'november-december',
  'mononenenmonownonommrsommonmonononomonim',
  'differenthheaedasd',
  'cooking-and-sewingroom',
  'cimmuminimiimscmiimommumcivirlatulicnimmicimo',
  'anewevangelicalmovementis',
  'different-shaped',
  'second-mentioned',
  'characteriscially',
  'narrowshouldered',
  'mlimimilmilimintimitimuilimmumitinitoml',
  'woonoommomutumout',
  'iiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiimiiiiiiiiiiiii',
  'imianimascrxematroottnummr',
  'whole-heartedness',
  'expressinteresting',
  'much-appreciated',
  'nine-years-for-eight-grades',
  'two-and-a-half-inch',
  'strongest-looking',
  'mperumougmwoesmaimmolmoo',
  'mmomirmarawarzogrammummum',
  'after-opportunity',
  'help-one-another',
  'buttonholestitches',
  'sacriinteresting',
  'iiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiie',
  'self-cultivation',
  'melarimmeirjillimmemmmmkmmmmwammm',
  'better-perfected',
  'composition-rhetoric',
  'iiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiii',
  'rvonniasommaannounemno',
  'self-satisfaction',
  'mommmommmomiommommommommcommomx',
  'student-teaching',
  'sub-consciousness',
  'somprinaimmommomp',
  'blood-corpuscles',
  'auto-intoxication',
  'deductive-inductive',
  'hypotheticremain',
  'manual-training-room',
  'four-by-six-inch',
  'teaching-process',
  'tioilliuniimpiiria',
  'experimentations',
  'health-destroying',
  'moommicamminionnimik',
  'tormamaimanalimeimrammalmwormellisolarm',
  'character-begetting',
  'commander-in-chief',
  'emmmmmmmmmmmmmmksw',
  'twenty-centa-week',
  'self-restoration',
  'conventionalists',
  'mommmmooommommommomm',
  "dutchman's-breeches",
  'iiiiiiiiiiiiiiiiiii',
  'minunturvmmmumtmummmemacmumuuricammwmpuumitudanruniggponmianntmuac',
  'soul-transfusing',
  "mmmmmmmonmmmm'ar",
  'limulannimilimcnimil',
  'nommunnenunnunnuon',
  'memememssemiememi',
  'missomastososawassowslowassess',
  'self-entertainment',
  'mwmmimmmwimmiimotimmmummimm',
  'uomommommoommomx',
  'swaddling-clothes',
  'teacher-evangelists',
  'per-cent-equipment',
  'mogonommoommoommommomomm',
  'arrnapanannammenfamme',
  'fammiemiumemmemium',
  'under-emphasized',
  'selfentertainment',
  'ministerplenipotentiary',
  'ilinesheavilymarked',
  'suassaimvamaussfamiallaaminimponnsiiimilelm',
  'theme-correcting',
  'paint-and-powder',
  'mmmommommommonoznrmommommomonomm',
  'self-examination',
  'peerlessglobefree',
  'mmwmniommnammmnsnm',
  'fortyfive-minute',
  'forty-five-minute',
  'ilommiiiiiiiimiiiiiiimiliiilimillliliiffilillffill',
  'immeminsimmiiiim',
  'anotherisuperintendent',
  'ultraconservatism',
  'mirinummiwipshim',
  'self-forgetfulness',
  'orator-statesman',
  'mommaolowmaselty',
  'eighteen-year-old',
  'mmovroasmonsomamer',
  'imoliiilmommonsatium',
  'rsaparsmovimarmasrarzumva',
  'wilson-carmichael',
  'two-hundred-sixty-fouracre',
  'mmmmmkimnimmmmmmmmmkimmkl',
  'twenty-five-cent',
  'cross-references',
  'stumbling-blocks',
  'equippecdoinmercial',
  'sixtyminute-period',
  'heaven-appointed',
  'boarding-schools',
  '--sehlerermarher',
  'oestovelferolsoard',
  "mckibnbiant'usrebisbelreieis",
  'seventeen-year-old',
  'oluemanimonommumunounmouoommommumeonemanumnatmommamenutaxeounew',
  'nsnnmritartlaite',
  'gnramozgrazamonommmgramramommonommrm',
  'christianeducatiion',
  'practice-teaching',
  'spanish-speaking',
  'pleasure-seeking',
  'cabbage-butterfly',
  'eiiimiiimiesimighirimmemmmmiiimiumnim',
  'subcommissioners',
  'itietheonlydictionary',
  'numnommmulanpnutonolunimmunommonnimoolnampummammlion',
  'mmummuminmaminin',
  'xxxxxxiccxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx',
  'emamiesimemmimmesissemiem',
  'butter-producing',
  'half-hibernating',
  'intermediate-school',
  'fifteen-year-old',
  'thirty-two-hundredth',
  'several-hundred-pound',
  'oeveionmemeortortemorsingto',
  'immimiiiiiiiiiiiiim',
  'isimmmrommimmmmmmmmei',
  'irurrrrrnttirrirrrrrrrrnriiiiii',
  "superintendents'",
  'milpepociritiort',
  'viiiimommiiimion',
  'ttttttttttttttttt',
  'booksandmagazines',
  'jack-in-thepulpit',
  'niumsmosnamnrumoutmannuaummmuuranymnommilmamimazummere',
  'theadventistseminaryofthe',
  'preconthemselves',
  'tiotamexamaxersaisocantmowamatammactaumeamar',
  'krzwargeeraggergermozwirwzreragrdzen',
  'recitation-rooms',
  'standard-trained',
  'ililllllllllllllll',
  'neldichannerrays',
  'ertrarsargartmemseebseeemeeemerai',
  'mommommommommmoommommimomm',
  'question-and-answer',
  'shouldfillrntthcntirkoverthetell',
  'unttritommuntfjl',
  'ninetyfive-per-cent',
  "mother-teacher's",
  'wirmaemssummtamnomoossonow',
  'secretary-treasurer',
  'literature-making',
  'under-missionaries',
  'well-disciplined',
  'stoop-shouldered',
  'three-andone-fourth-inch',
  'geography-history',
  'born-and-not-made',
  'matturamarsirmartarstweenneeheramarastrk',
  'christian-education',
  'january-february',
  'mmwmmnfumnsloismnmmase',
  'president-emeritus',
  'one-hundred-per-cent',
  'oimmommicommmmoromm',
  'foreign-language',
  'celebrationelaborate',
  'mionimiimmonmand',
  'mmmmmmmmmmmmmkimmmmmmmmmmmmmmmmm',
  'murscumoulmanimurocuniinniirmo',
  'ttttttttttttttttttttttttttt',
  'general-information',
  'thought-sequence',
  'danish-norwegian',
  'analytic-synthetic',
  'piiiiiiiiiiiiiiiiiiiiiii',
  'governbr-general',
  'teacher-candidates',
  'neeerarezereezzeozoorergenszoooroz',
  'nmennwnendwomemen',
  'foundation-stones',
  'fourteenth-grade',
  'amtlieltmonionlitimiciimmulorunumoiimmiummailoilmonmumiltimamponiatnntimcalummoomminoinimitinomprnmaniumn',
  'inliniallitinlikmmun',
  'faculty--thorough',
  'theonlydictionary',
  'wattanmsifammimmfamtantimpaesimm',
  'perfunctionaries',
  'inductivedeductive',
  'leather-upholstered',
  'nilliirrlirlinlinlinlinillii',
  'well-thought-out',
  'rairmeemieekeemeemeemeemeeeeesyn',
  'twenty-cent-aweek',
  'grormanraympanyortrareammpannyrtram',
  'undecidedwhatlineofwork',
  'sauragmuumoturawar',
  'xxxxxxxxxxxxxxxxxxxxxx',
  'illllllllllllllplllllilllllllllllllllllllll',
  'medical-evangelist',
  'sixteenth-century',
  'miumnitommumwelswomisomomoinomemommt',
  'iiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiii',
  'reimiiiminimiellillik',
  'ipiiiiiiiiiillll',
  'miiiimindibuinidimmo',
  'bloodless-lipped',
  'theonlydictionarywith',
  "three-year-old's",
  'demonstration-room',
  'already-accumulated',
  'ecirmormarimmeramogralemenieemieeee',
  'different-looking',
  'composition-writing',
  'interrogation-point',
  'whyitwasthathewassofondof',
  'missionary-explorer',
  'wrifixtemmommomeme',
  'thegeneralconferencemissionarytraining',
  'isweemeeeeeeerameektet',
  'literary-centered',
  'danish--norwegian',
  'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx',
  'mmmmwmmwmgrcammewiwimn-amirm',
  'characterdeveloping',
  'neeeniviseeisissississeissimaismism',
  'resemiemememememimem',
  'self-development',
  'eaitatraltenitnzsnnmaitoftfazmrifnsklag',
  'foreign-speaking',
  'amusement-loving',
  'assyrio-babylonian',
  'greatgrandmothers',
  'after-cultivation',
  'nrnomimagnmonnimittmultut',
  'iliiiiiiiiiiiiiiiiiiiiiiiiiiii',
  "industrial'instruction",
  'emeeeeeeeeemeeeeeeeeeeeeeeeeerm',
  'winmenninniwoimp',
  'anewevangelicalmovementisthegreatestneedof',
  'nurnovermenoiseam',
  "papers'apecified",
  'prayerand-praise',
  "admiristration's",
  'long-established',
  'candlestick-makers',
  'rirtilipailipinnliv',
  'xxxxxxmcxxxxxxxxxxxxxxxxxxxxxo',
  'monimmimimmonominiii',
  'mmiiiiiiiiiiiiiiiiiimiiiiiiiiiiiiiiiii',
  'whileconstructing',
  'september-october',
  'physical-culture',
  'mmmmmmmmmmmmmeimmmmmmmmmmmmmemmm',
  'three-and-one-half-inch',
  'four-and-onefourth-inch',
  'miincmitinintiovuuluagmnumitamr',
  'mmminsrummmmonmemmonwiemnmski',
  'great-grandmothers'],
 15)

Correction 8 -- Remove long error tokens

In [43]:
# %load shared_elements/remove-tokens-with-long-strings-of-characters.py
prev = "correction7"
cycle = "correction8"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    
    sub_list = ["i|I", "w|W", "m|M", "e|E", "l|L", "x|X", "r|R", "t|T"]
    
    replacements = []
    for sub in sub_list:
        replacements.append(clean.check_for_repeating_characters(tokens, sub))
    
    replacements = [item for sublist in replacements for item in sublist]
            
    if len(replacements) > 0:
        print('{}: {}'.format(filename, replacements))

        for replacement in replacements:
            content = clean.replace_pair(replacement, content)
    else:
        pass
    
    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
CE19110401-V02-04-page3.txt: [('XXXXXXMCXXXXXXXXXXXXXXXXXXXXXO', ' '), ('XXXXXXXXXXXXXXXXXXXXXX', ' ')]
CE19110601-V02-05-page3.txt: [('xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx', ' '), ('XXXXXXICCXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX', ' ')]
CE19110801-V02-06-page50.txt: [('riMMMUUMMIUMMEMMIESIIMIIMIMIIIMIIMUMIliMitiOIMA', ' '), ('riMMMUUMMIUMMEMMIESIIMIIMIMIIIMIIMUMIliMitiOIMA', ' ')]
CE19110801-V02-06-page52.txt: [('EIIIMIIIMIESIMigHiRIMMEMMMMIIIMIUMNIM', ' ')]
CE19120401-V03-04-page3.txt: [('MOMMMMOOOMMOMMOMMOMM', ' '), ('MOMMMOMMMOMIOMMOMMOMMOMMCOMMOMX', ' ')]
CE19120601-V03-05-page4.txt: [('mmmommommommonoznrmommommomonomm', ' '), ('gnramozgrazamonommmgramramommonommrm', ' '), ('uomommommoommomx', ' ')]
CE19121001-V04-01-page4.txt: [('IrurrrrrnttirrirrrrrrrrnriIIIII', ' ')]
CE19121001-V04-01-page60.txt: [('IIIIIIIIIIIIIIIII', ' '), ('IMMIMIIIIIIIIIIIIIM', ' ')]
CE19121001-V04-01-page64.txt: [('MOMMOMMOMMMOMM', ' ')]
CE19121001-V04-01-page67.txt: [('numnommmulanpnutonolunimmunommonnimoolnampummammlion.', ' '), ('..cummunimmil....m.nonmumn...lurloomminnommoirrimmoommonmonmon.t...', ' ')]
CE19121001-V04-01-page69.txt: [('NEEEEREERREZEEEE', ' ')]
CE19121001-V04-01-page70.txt: [('OIMMOMMICOMMMMOROMM', ' '), ('MMOMMOZMOMMMOZMMMOMMMOMMIMOMMOZMOM', ' ')]
CE19121201-V04-03-page2.txt: [('MOMMOMMOMMOMMMOOMMOMMIMOMM', ' ')]
CE19121201-V04-03-page36.txt: [('IIIIIIIIIIIIIII', ' '), ('PIIIIIIIIIIIIIIIIIIIIIII', ' '), ('fillitn...anrpleryllm.OVIMMIIMITilliiryMirtnin.', ' ')]
CE19130101-V04-04-page2.txt: [('..onn........niumsmosnamnrumoutmannuaummmuuranymnommilmamimazummere', ' '), ('mmomirmarawarzogrammummum', ' ')]
CE19130101-V04-04-page37.txt: [('iMOLiiilMommonsatium............m.amtliEltmonionlitimiciimmulorunumoiimmiummailoilmonmumiltimamponiatnntimcalummoommInoinimitinomprnmaniumn.imiziiintmu.alovosafirminlagain......MmuSsmnrt.', ' '), ('iMOLiiilMommonsatium............m.amtliEltmonionlitimiciimmulorunumoiimmiummailoilmonmumiltimamponiatnntimcalummoommInoinimitinomprnmaniumn.imiziiintmu.alovosafirminlagain......MmuSsmnrt.', ' ')]
CE19130101-V04-04-page38.txt: [('IIIPICIMIIIIII.M', ' '), ('Oluemanimonommumunounmouoommommumeonemanumnatmommamenutaxeounew', ' ')]
CE19130101-V04-04-page41.txt: [('IIIIIIIIIIIIIIIIIII', ' '), ('Illllllllllllllplllllilllllllllllllllllllll', ' '), ('Illilllllllllll', ' '), ('IIIIIillllllllll', ' '), ('Ililllllllllllllll', ' ')]
CE19130101-V04-04-page42.txt: [('IIMMIIIIIIMIIIIIIIIIIIIIIIIIIiffillii', ' ')]
CE19130101-V04-04-page43.txt: [('MMIIIIIIIIIIIIIIIIIIMIIIIIIIIIIIIIIIII', ' '), ('IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIE', ' ')]
CE19130201-V04-05-page2.txt: [('Imianimascrxematroottnummr.murscumoulmanimurocuniinniirmo...................miincmitinintiovuuLuagmnumitamr', ' '), ('minunturvmmmumtmummmemacmumuuricammwmpuumituDanruniggponmianntmuac', ' ')]
CE19130501-V04-08-page2.txt: [('MOIIMON.CIMMUMINIMIIMSCMIIMOMMUMCIVIrlatuliCNImmICIMO', ' '), ('MOIIMON.CIMMUMINIMIIMSCMIIMOMMUMCIVIrlatuliCNImmICIMO', ' ')]
CE19130601-V04-09-page2.txt: [('IIIMIIIIMIIIIIIII', ' ')]
CE19130601-V04-09-page36.txt: [('PINMOININIIIIIIMMIRMIIIIMI.', ' ')]
CE19130701-V04-10-page12.txt: [('voloommamomeammoommonwrart.', ' ')]
CE19130701-V04-10-page3.txt: [('ErtrarsargartMEMSEEBSEEEMEEEMErai', ' ')]
CE19130701-V04-10-page45.txt: [('MMMMMMMMMMMMMM', ' ')]
CE19130701-V04-10-page7.txt: [('ILIIIIIIIIIIIIIIIIIIIIIIIIIIII', ' ')]
CE19130901-V05-01-page3.txt: [('mmurnueuxummututomonminumminmamomoomemmonnuant........nonnammm.....mmummnminnummenamilminantimmammumonmitimmonmonolimmumnimmE', ' ')]
CE19131101-V05-03-page3.txt: [('EREZZigggEERRZWZREERIZERREEREERI', ' ')]
CE19131201-V05-04-page3.txt: [('MMMMMMMiMMSMNMNiMMMMMM', ' '), ('MMMMMKIMNiMMMMMMMMMKIMMKl', ' ')]
CE19140101-V05-05-page3.txt: [('WATTANMSIFAMMIMMFAMTANTIMPAESIMM', ' ')]
CE19140201-V05-06-page35.txt: [('EENNEEEEEEEEE', ' ')]
CE19140301-V05-07-page2.txt: [('mmwmmnFumnsloismnmmasE', ' '), ('EmmmmmmmmmmmmmmKsw', ' ')]
CE19140301-V05-07-page34.txt: [("MMMMMMMONMMMM'ar", ' '), ('MElarIMMEIRJILLIMMEMMMMKMMMMWAMMM', ' ')]
CE19140401-V05-08-page3.txt: [('ISIMMMROMMIMMMMMMMMEI', ' '), ('NOMMKIMMEMMMM', ' ')]
CE19140501-V05-09-page2.txt: [('.Nc.m.martn.N.R.R..mmammammE', ' '), ('EMEEEEEEEEEMEEEEEEEEEEEEEEEEErm', ' ')]
CE19140501-V05-09-page3.txt: [('nSISTAKIMMEMESTAMMMKIMMMMM', ' ')]
CE19140801-V05-10-page2.txt: [('mmwmniommnammmnsnm', ' '), ('MMMMMMMMMMMMMEIMMMMMMMMMMMMMEMMM', ' ')]
CE19140801-V05-10-page31.txt: [('MMHiMmmMMMMMMmmnismMmMmmmMMMMMM', ' '), ('MMMMMMMMMMMMMKIMMMMMMMMMMMMMMMMM', ' ')]
CE19140801-V05-10-page34.txt: [('rairMEEMIEEKEEMEEMEEMEEMEEEEESYN', ' ')]
CE19141001-V06-02-page3.txt: [('MMKIMMMESSIMKIMM', ' '), ('nWiLMELMNinNMnrrIMMMMEffiMMMMMMMMMON', ' ')]
CE19141001-V06-02-page35.txt: [('MMENCEMIERTMMIMIMIIVIII.OKIWIMMININANOMFS', ' ')]
CE19141101-V06-03-page3.txt: [('MWOIMM-IMMEN.O.MMOO.', ' ')]
CE19150301-V06-07-page30.txt: [('eeeeeeeeeeeee', ' ')]
CE19150501-V06-09-page34.txt: [('ttttttttttttttttttttttttttt', ' '), ('ttttttttttttttttt', ' ')]
CE19150901-V07-01-page6.txt: [('EEEEEEEwAEEEEEEEENETaratasart', ' '), ('iswEEmEEEEEEEramEEKtet', ' ')]
CE19151001-V07-02-page3.txt: [('tfEEEEEEEEEEEEEEEEEEEEEEEEEEEEkt', ' ')]
CE19160401-V07-08-page3.txt: [('mmmmwmmwmgrcAmmEwiwimn-Amirm', ' ')]
CE19160601-V07-10-page3.txt: [('.mogonommoommoommommomomm', ' '), ('monommemommononommom', ' ')]
CE19160901-V08-01-page3.txt: [('mmmmmmwmmmmmmmmmmIswm', ' ')]
CE19160901-V08-01-page32.txt: [('m......ammasennown.....m.somprinaimmommomp', ' ')]
CE19161001-V08-02-page3.txt: [('mmminsrummmmoNmEmmoNwiEmnmski', ' ')]
CE19161101-V08-03-page3.txt: [('IIIIIIIIIIIIIII', ' '), ('Enommiummummimmmummunimmiummimmminiiiimmummiumummilimomffimummilmommuniimmimmoimmillimmimmmilms', ' ')]
CE19161201-V08-04-page3.txt: [('IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIMIIIIIIIIIIIII', ' '), ('ilommiiiiiiiimiiiiIiimiliiiliMillliliiffilillffill', ' '), ('IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII', ' '), ('IpIIIIIIIIIillll', ' '), ('ilommiiiiiiiimiiiiIiimiliiiliMillliliiffilillffill', ' ')]
CE19170301-V08-07-page2.txt: [('IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII', ' ')]
CE19180101-V09-05-page3.txt: [('IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII', ' ')]
CE19180201-V09-06-page2.txt: [('IIMINEMEMINEMMEMMENNUMMENIMEMENIENEHMENNUMMERINEMEMEN', ' ')]
CE19180901-V10-01-page4.txt: [('IIIIIIIIIIIIIIIII', ' '), ('IPWWWWWWWWWWWMOIMMU', ' '), ('IDWWWMIUMWRIMMMOWIIHMMNIMMOIRWMWEIVMMIUM', ' '), ('WOWMMWMIRM.MWMMIMMMWIMMIIMOTIMMMUMMIMM', ' '), ('IIIMMIOIMMM.IMM', ' ')]
In [44]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/CE/correction8

Average verified rate: 0.9853232442016993

Average of error rates: 0.025193018480492813

Total token count: 1903963

In [45]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[45]:
[('e', 1617),
 ("'", 1534),
 ('m', 1501),
 ('d', 1311),
 ('w', 734),
 ('r', 708),
 ('x', 682),
 ('t', 618),
 ('g', 589),
 ('n', 572),
 ('f', 547),
 ('u', 146),
 ('co', 141),
 ('k', 131),
 ('li', 115),
 ('z', 84),
 ('id', 73),
 ('io', 61),
 ('pp', 61),
 ('ni', 54),
 ('oo', 54),
 ('-', 52),
 ('ex', 52),
 ('mt', 44),
 ('mo', 43),
 ("an'", 43),
 ('mm', 42),
 ('high-school', 40),
 ('prayer-life', 39),
 ('th', 38),
 ('parent-teacher', 37),
 ('il', 37),
 ('ti', 36),
 ("hours'", 34),
 ('tion', 34),
 ('ne', 33),
 ('soul-winning', 32),
 ('q', 32),
 ('ri', 32),
 ('em', 32),
 ('re', 31),
 ('sq', 28),
 ('--', 27),
 ('al', 26),
 ('ry', 26),
 ('danish-norwegian', 26),
 ('ginn', 26),
 ('tne', 26),
 ('ft', 25),
 ('ph', 25)]

Correction 9 -- Separate Squash Words

In [46]:
# %load shared_elements/separate_squashed_words.py
import pandas as pd
from math import log

prev = cycle
cycle = "correction9"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

verified_tokens = []

for filename in corpus:  
    content = utilities.readfile(directories['prev'], filename)
    clean.get_approved_tokens(content, spelling_dictionary, verified_tokens)

tokens_with_freq = dict(collections.Counter(verified_tokens))
words = pd.DataFrame(list(tokens_with_freq.items()), columns=['token','freq'])
words_sorted = words.sort_values('freq', ascending=False)
words_sorted_short = words_sorted[words_sorted.freq > 2]

sorted_list_of_words = list(words_sorted_short['token'])

wordcost = dict((k, log((i+1)*log(len(sorted_list_of_words)))) for i,k in enumerate(sorted_list_of_words))
maxword = max(len(x) for x in sorted_list_of_words)

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    text = utilities.strip_punct(content)
    tokens = utilities.tokenize_text(text)
    
    replacements = []
    
    for token in tokens:
        if not token.lower() in spelling_dictionary:
            if len(token) > 17:
                if re.search(r"[\-\-\'\"]", token):
                    pass
                else:
                    split_string = clean.infer_spaces(token, wordcost, maxword)
                    list_split_string = split_string.split()
                    
                    if clean.verify_split_string(list_split_string, spelling_dictionary):
                        replacements.append((token, split_string))
                    else:
                        pass
            else:
                pass
        else:
            pass
        
    if len(replacements) > 0:
        print("{}: {}".format(filename, replacements))
        
        for replacement in replacements:
            content = clean.replace_pair(replacement, content)
    
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
CE19100201-V01-03-page1.txt: [('consciencestricken', 'conscience stricken')]
CE19111001-V03-01-page25.txt: [('inductivedeductive', 'inductive deductive')]
CE19120401-V03-04-page43.txt: [('buttonholestitches', 'buttonhole stitches')]
CE19120601-V03-05-page54.txt: [('progressiveagitation', 'progressive agitation')]
CE19121001-V04-01-page66.txt: [('Undecidedwhatlineofwork', 'Undecided what line of work')]
CE19121201-V04-03-page2.txt: [('ANEWEVANGELICALMOVEMENTis', 'A NEW EVANGELICAL MOVEMENT is')]
CE19130401-V04-07-page2.txt: [('asolassammenamnreonm', 'a sol as s am men am n r e o n m')]
CE19131101-V05-03-page2.txt: [('TheGeneralConferenceMissionaryTraining', 'The General Conference Missionary Training')]
CE19131201-V05-04-page2.txt: [('ANEWEVANGELICALMOVEMENTisthegreatestneedof', 'A NEW EVANGELICAL MOVEMENT is the greatest need of')]
CE19141001-V06-02-page9.txt: [('expressinteresting', 'express interesting')]
CE19141101-V06-03-page34.txt: [('theonlydictionarywith', 'the only dictionary with')]
CE19150101-V06-05-page8.txt: [('characterdeveloping', 'character developing')]
CE19150401-V06-08-page34.txt: [('itietheonlydictionary', 'i tie the only dictionary')]
CE19151001-V07-02-page2.txt: [('ilinesheavilymarked', 'i lines heavily marked'), ('figuresinlargetype', 'figures in large type')]
CE19160301-V07-07-page30.txt: [('whyitwasthathewassofondof', 'why it was that he was so fond of')]
CE19170201-V08-06-page19.txt: [('celebrationelaborate', 'celebration elaborate')]
CE19170301-V08-07-page31.txt: [('TheAdventistSeminaryofthe', 'The Adventist Seminary of the')]
CE19200201-V11-06-page30.txt: [('Anotherisuperintendent', 'Another i superintendent')]
In [47]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/CE/correction9

Average verified rate: 0.9853311057762251

Average of error rates: 0.02517864476386037

Total token count: 1904029

In [48]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[48]:
[('e', 1618),
 ("'", 1534),
 ('m', 1502),
 ('d', 1311),
 ('w', 734),
 ('r', 709),
 ('x', 682),
 ('t', 618),
 ('g', 589),
 ('n', 574),
 ('f', 547),
 ('u', 146),
 ('co', 141),
 ('k', 131),
 ('li', 115),
 ('z', 84),
 ('id', 73),
 ('pp', 61),
 ('io', 61),
 ('ni', 54),
 ('oo', 54),
 ('-', 52),
 ('ex', 52),
 ('mt', 44),
 ('mo', 43),
 ("an'", 43),
 ('mm', 42),
 ('high-school', 40),
 ('prayer-life', 39),
 ('th', 38),
 ('parent-teacher', 37),
 ('il', 37),
 ('ti', 36),
 ("hours'", 34),
 ('tion', 34),
 ('ne', 33),
 ('soul-winning', 32),
 ('q', 32),
 ('ri', 32),
 ('em', 32),
 ('re', 31),
 ('sq', 28),
 ('--', 27),
 ('al', 26),
 ('ry', 26),
 ('danish-norwegian', 26),
 ('ginn', 26),
 ('tne', 26),
 ('ft', 25),
 ('ph', 25)]
In [ ]: