GS-OCR-Evaluation-and-Correction

In [1]:
%load_ext autoreload
In [2]:
%autoreload 2
In [3]:
from text2topics import reports
from text2topics import utilities
from text2topics import clean
import re
import os
from os import listdir
from os.path import isfile, join
import collections
In [4]:
%matplotlib inline
In [5]:
wordlist_dir = "/Users/jeriwieringa/Dissertation/drafts/data/word-lists"
wordlists = ["2016-12-07-SDA-last-names.txt", 
             "2016-12-07-SDA-place-names.txt", 
             "2016-12-08-SDA-Vocabulary.txt", 
             "2017-01-03-place-names.txt", 
             "2017-02-14-Base-Word-List-SCOWL&KJV.txt",
             "2017-02-14-Roman-Numerals.txt",
             "2017-03-01-Additional-Approved-Words.txt"
            ]
In [6]:
spelling_dictionary = utilities.create_spelling_dictionary(wordlist_dir, wordlists)
In [7]:
title = "GS"
In [8]:
base_dir = "/Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/{}/".format(title)

Baseline

In [9]:
cycle = 'baseline'
In [10]:
stats = reports.overview_report(join(base_dir, cycle), spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/GS/baseline

Average verified rate: 0.9334002205428498

Average of error rates: 0.06745357142857143

Total token count: 1297707

In [11]:
errors_summary = reports.get_errors_summary( stats )
reports.top_errors( errors_summary, 100 )
Out[11]:
[('-', 2890),
 ("'", 2441),
 ('¥', 2014),
 ('ñ', 1493),
 (')', 1170),
 ('re-', 816),
 ('tion', 746),
 ('d', 692),
 ('con-', 690),
 ('t', 642),
 ('in-', 504),
 ('be-', 495),
 ('e', 489),
 ('w', 489),
 ('com-', 462),
 ('r', 425),
 ('(', 418),
 ('de-', 405),
 ('ment', 378),
 ('m', 360),
 ('f', 355),
 ('ñthe', 308),
 ('g', 303),
 ('ñno', 288),
 ('n', 279),
 ('ex-', 267),
 ('sab-', 252),
 ('th', 238),
 ("of'", 230),
 ('un-', 225),
 ("'the", 204),
 (']', 202),
 ("the'", 199),
 ('tions', 193),
 ('u', 189),
 ('im-', 186),
 ('dis-', 182),
 ('pro-', 178),
 ('ña', 171),
 ('en-', 171),
 ('ex', 170),
 ('per-', 167),
 ('pre-', 158),
 ('[vol', 156),
 ('ments', 154),
 ('x', 143),
 ('--', 141),
 ('chris-', 140),
 ('an-', 136),
 ('ob-', 135),
 ('command-', 132),
 ('ac-', 131),
 ('ble', 131),
 ('ap-', 129),
 ("'of", 127),
 ('ã', 126),
 ('/', 121),
 ('pp', 118),
 ('ad-', 111),
 ('aro', 110),
 ('semi-monthly', 109),
 ('_', 108),
 ('-the', 107),
 ('first-day', 107)]

Check Special Character Use

In [12]:
reports.tokens_with_special_characters(errors_summary)[:500]
Out[12]:
[('¥', 2014),
 ('ñ', 1493),
 (')', 1170),
 ('(', 418),
 ('ñthe', 308),
 ('ñno', 288),
 (']', 202),
 ('ña', 171),
 ('[vol', 156),
 ('ã', 126),
 ('/', 121),
 ('_', 108),
 ('*', 75),
 ('[the', 72),
 ('(rev', 71),
 ('(see', 67),
 ('ñjohn', 62),
 ('ñparties', 55),
 ('(matt', 51),
 ('ñit', 50),
 ('(the', 49),
 ('ñtent', 48),
 ('ñmeetings', 45),
 ('¥¥', 43),
 ('ñeld', 42),
 ('ñchurch', 42),
 ('ñselected', 40),
 ('`', 40),
 ('ñthat', 39),
 ('(verse', 38),
 ('ñj', 38),
 ('(isa', 34),
 ('(heb', 34),
 ('(gen', 33),
 ('ô', 32),
 ('(rom', 32),
 ('(john', 31),
 ('ñnot', 31),
 ('%', 31),
 ('ñand', 31),
 ('(acts', 30),
 ('(ps', 28),
 ('\\', 26),
 ('ñin', 26),
 ('ñi', 25),
 ('(as', 24),
 ('(a', 24),
 ('(for', 24),
 ('ñelds', 23),
 ('=', 23),
 ('ñbecause', 22),
 ('¥the', 22),
 ('(luke', 22),
 ("¥'", 22),
 ('ñtwo', 21),
 ('(and', 20),
 ('ñat', 20),
 ('ñfour', 20),
 ('ñs', 20),
 ('ñhistory', 19),
 ('(which', 19),
 ('ñyes', 19),
 ('[or', 19),
 ('•', 18),
 ('[von', 18),
 ('ñw', 18),
 ('ñd', 18),
 ('ñan', 18),
 ('ñfive', 18),
 ('ñc', 17),
 ('ñthis', 17),
 ('(i', 17),
 ('ñsix', 17),
 ('ñthey', 17),
 ('£', 17),
 ('ñnew', 16),
 ('ñcertainly', 16),
 ('(mark', 16),
 ('(ex', 16),
 ("'¥", 16),
 ('ñchristian', 16),
 ('(dan', 16),
 ('(chap', 16),
 ('ñdr', 16),
 ('(separate', 15),
 ('ñtent-meetings', 15),
 ('ñh', 15),
 ('volume)', 15),
 ('ñseveral', 15),
 ('ñto', 15),
 ('[christ]', 14),
 ('[that', 14),
 ('ñwe', 14),
 ('ó', 14),
 ('¥¥¥', 14),
 ('ñby', 14),
 ('ñthese', 14),
 ('the¥', 14),
 ('ñone', 13),
 ('-¥', 13),
 ('ñall', 13),
 ('ñwhy', 13),
 ('ñis', 13),
 ('~~', 13),
 ('ñsigns', 13),
 ('ñthree', 13),
 ('(eph', 13),
 ('pamphlets)', 13),
 ('ñjames', 12),
 ('ñg', 12),
 ('ñsel', 12),
 ('—', 12),
 ('ñseven', 12),
 ('ñwhat', 12),
 ('¥and', 12),
 ('ñthose', 12),
 ('ñbut', 12),
 ('ñnine', 12),
 ('ñset', 11),
 ('christñthe', 11),
 ('ñeight', 11),
 ('^', 11),
 ('[sunday]', 11),
 ('ñbarnes', 11),
 ('ñgeneral', 11),
 ('ñfrom', 11),
 ('ñhe', 11),
 ('¡', 11),
 ('ñrev', 11),
 ('ñstate', 10),
 ('(in', 10),
 ('(if', 10),
 ('[in', 10),
 ('(jer', 10),
 ('ñten', 10),
 ('ñl', 10),
 ('of¥', 10),
 ('it)', 10),
 ('ñas', 10),
 ('ñmost', 9),
 ('ñn', 9),
 ('ñidem', 9),
 ('ñtwelve', 9),
 ('bibleñthe', 9),
 ('[vox', 9),
 ('ñe', 9),
 ('ñwhen', 9),
 ('mechanicalñmassage', 9),
 ('ñthere', 9),
 ('ñthen', 9),
 ('[not', 9),
 ('ñif', 9),
 ('in¥', 9),
 ('[a', 9),
 ('(or', 8),
 ('(lay', 8),
 ('ñprof', 8),
 ('ñscott', 8),
 ('ñvol', 8),
 ('¥¥¥¥¥¥', 8),
 ('(not', 8),
 ('(col', 8),
 ('movementsñmanual', 8),
 ('#', 8),
 ('[margin', 8),
 ('(gal', 8),
 ('ñcan', 8),
 ('ñf', 8),
 ('ñour', 8),
 ('andñ', 8),
 ('ñmrs', 8),
 ('day)', 8),
 ('[vot', 8),
 ('(eze', 8),
 ('¥it', 8),
 ('and¥', 8),
 ('[god]', 8),
 ('a¥', 8),
 ('ñt', 7),
 ('ñdialogues', 7),
 ('(that', 7),
 ('margin]', 7),
 ('_the', 7),
 ('¥of', 7),
 ('%%%%%', 7),
 ('ñcamp-meeting', 7),
 ('[vor', 7),
 ('ñchrist', 7),
 ('<', 7),
 ('ñencouraging', 7),
 ('(verses', 7),
 ('is¥', 7),
 ('ñfor', 7),
 ('ñclarke', 7),
 ('{', 7),
 ('ñmission', 7),
 ('[', 7),
 ('ñr', 6),
 ('ñoh', 6),
 ('law)', 6),
 ('thronesñspiritualism', 6),
 ('(margin)', 6),
 ('day]', 6),
 ('christ]', 6),
 ('(zech', 6),
 ('¥i', 6),
 ('ñhave', 6),
 ('godñwhat', 6),
 ('deadñthe', 6),
 ('be¥', 6),
 ('ñspecial', 6),
 ('ñindependent', 6),
 ('ñwm', 6),
 ('ñwith', 6),
 ('godñthe', 6),
 ('ñm', 6),
 ('goapr_', 6),
 ('ñreview', 6),
 ('delusionñsamuel', 6),
 ('(dent', 6),
 ('ñredemptionñthe', 6),
 ('¥¥¥¥¥', 6),
 ('ñsir', 6),
 ('referencesñtithes', 6),
 ('lawsñseven', 6),
 ('(state', 6),
 ('ñfifteen', 6),
 ('¥-', 6),
 ('sabbathñfirst', 6),
 ('(minn', 6),
 ("gospelñgod's", 6),
 ('earth]', 6),
 ('wickedñlost', 6),
 ('i)', 6),
 ('commandmentsñwithout', 6),
 ('meeting)', 6),
 ('¥to', 6),
 ('offeringsñseventh', 6),
 ('¥a', 6),
 ('ñend', 6),
 ('+', 6),
 ('(b', 6),
 ('(to', 6),
 ('ñjust', 6),
 ('abolishedñthe', 6),
 ('(n', 6),
 ('gatedñone', 5),
 ('ñcoming', 5),
 ('r¥', 5),
 ('margin)', 5),
 ('(he', 5),
 ('millenniumñ', 5),
 ('ñonly', 5),
 ('ñfirst', 5),
 ('death]', 5),
 ('ñvery', 5),
 ('[is]', 5),
 ('(plural)', 5),
 ('ñconsiderable', 5),
 ('adventistsñten', 5),
 ('itñ', 5),
 ('ñthirteen', 5),
 ('abolishedñan', 5),
 ('baptistsñsigns', 5),
 ('~', 5),
 ('ñhist', 5),
 ('ñlearned', 5),
 ("[god's]", 5),
 ('ñnothing', 5),
 ('ñof', 5),
 ('la*', 5),
 ('ñsixteen', 5),
 ('¥in', 5),
 ('truthñori-', 5),
 ('sabbathñ', 5),
 ('ñso', 5),
 ('(by', 5),
 ('timeñ', 5),
 ('littleñthe', 5),
 ('[to', 5),
 ('[void', 5),
 ('questionñs', 5),
 ('to¥', 5),
 ('(though', 5),
 ('(ezra', 5),
 ('sabbathñthe', 5),
 ('(catholic)', 5),
 ('[as', 5),
 ('judgmentñ', 5),
 ('ventñthe', 5),
 ('dayñdeparting', 5),
 ('\ufeff', 5),
 ('ñsimply', 5),
 ('michã', 5),
 ('manñseven', 5),
 ('ñeleven', 5),
 ('candidñwhich', 5),
 ('¥is', 5),
 ('ñpage', 5),
 ('time)', 5),
 ('ñpope', 5),
 ('[which', 5),
 ('year)', 5),
 ('fateñthe', 4),
 ('ñon', 4),
 ('(lev', 4),
 ('(lie', 4),
 ('[of', 4),
 ('*the', 4),
 ('¥be', 4),
 ('(compare', 4),
 ('ñsaid', 4),
 ('/c', 4),
 ('¥¥¥¥¥¥¥', 4),
 ('endorñthe', 4),
 ('all)', 4),
 ('sun)', 4),
 ('them]', 4),
 ('re¥', 4),
 ('them)', 4),
 ('o)', 4),
 ('t)', 4),
 ('(eccl', 4),
 ('ñencyclopedia', 4),
 ('excuseñ', 4),
 ('ñsabbath', 4),
 ('be)', 4),
 ('(sunday)', 4),
 ('reasonsñthe', 4),
 ('sunday]', 4),
 ('[and', 4),
 ('ñbible', 4),
 ('(german)', 4),
 ('ñgeo', 4),
 ('ñcourse', 4),
 ('ñmen', 4),
 ('ñnone', 4),
 ('ñnay', 4),
 ('¡reek', 4),
 ('`the', 4),
 ('ñcatholic', 4),
 ('ñshe', 4),
 ('christ)', 4),
 ('¥at', 4),
 ('¥¥¥¥', 4),
 ('(lid', 4),
 ('ñreports', 4),
 ('ñhis', 4),
 ('sabbathñsun-', 4),
 ('ñscripture', 4),
 ('—no', 4),
 ('ñdouay', 4),
 ('ñb', 4),
 ('his¥', 4),
 ('ñcyclopedia', 4),
 ('jews]', 4),
 ('ñwhich', 4),
 ('ñbishop', 4),
 ('ñwas', 4),
 ('testamentñthe', 4),
 ('¥r', 4),
 ('¡thing', 4),
 ('ñhad', 4),
 ('`there', 4),
 ('x)', 4),
 ('ñspurgeon', 4),
 ('lordñsign', 4),
 ('ñmembership', 4),
 ('[i', 4),
 ('years)', 4),
 ('ãthe', 4),
 ('ñhow', 4),
 ('life]', 4),
 ("'ñ", 4),
 ('(like', 4),
 ('ñevidently', 4),
 ('`and', 4),
 ('¥as', 4),
 ('churchesñas', 4),
 ('lordñper-', 4),
 ('prophecyñthe', 4),
 ("ñworkers'", 3),
 ('[greek', 3),
 ('¥we', 3),
 ('ñyou', 3),
 ('(after', 3),
 ('@', 3),
 ('(children', 3),
 ('(i)', 3),
 ('ñst', 3),
 ('church)', 3),
 ('±', 3),
 ('%%%%%%', 3),
 ('ñvictor', 3),
 ('ñsunday', 3),
 ('ñancient', 3),
 ('¥they', 3),
 ('[jesus]', 3),
 ('/i', 3),
 ('[italics', 3),
 ('nearlñis', 3),
 ('(according', 3),
 ('ñmuch', 3),
 ('ñabout', 3),
 ('¥for', 3),
 ("ñandrews's", 3),
 ('¥this', 3),
 ('ñwho', 3),
 ('¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥', 3),
 ('ñdebates', 3),
 ('ñgen', 3),
 ('ñii', 3),
 ('ñart', 3),
 ('ñgreenfield', 3),
 ('ñchicago', 3),
 ('ñpriest', 3),
 ('¥that', 3),
 ('least)', 3),
 ('ñsuch', 3),
 (")'", 3),
 ('ñcom', 3),
 ('ñdoes', 3),
 ('ñsabbath-school', 3),
 ('ñgod', 3),
 ('tion)', 3),
 ('(p', 3),
 ('¥an', 3),
 ('(mal', 3),
 ('¥all', 3),
 ('—it', 3),
 ('%/', 3),
 ('sabbath]', 3),
 ('oneñ', 3),
 ('ñfar', 3),
 ('%%', 3),
 ('[rest]', 3),
 ('church]', 3),
 ('ñdean', 3),
 ('faith]', 3),
 ('(led', 3),
 ('t¥', 3),
 ("ñ'", 3),
 ('are¥', 3),
 ('so)', 3),
 ('¥law', 3),
 ('%%%%%%%%%', 3),
 ('con¥', 3),
 ('ñdoctrinal', 3),
 ('that¥', 3),
 ('(christ', 3),
 ("'/", 3),
 ('ñseries', 3),
 ('¥¥¥¥¥¥¥¥¥', 3),
 ('that]', 3),
 ('the)', 3),
 ('ñrevival', 3),
 ('preparingñflee', 3),
 ('ñmore', 3),
 ('ñdes', 3),
 ('ñgolden', 3),
 ('rome]', 3),
 ('(james', 3),
 ('ñwell', 3),
 ('ñah', 3),
 ('(christ)', 3),
 ('do)', 3),
 ('ñannual', 3),
 ("ñsmith's", 3),
 ('iô', 3),
 ('one)', 3),
 ('baptistsñthe', 3),
 ('father]', 3),
 ('publish=', 3),
 ('ñjesus', 3),
 ('but¥', 3),
 ('[john', 3),
 ('ñmany', 3),
 ('he¥', 3),
 ('[heb', 3),
 ('ñcottage', 3),
 ('ño', 3),
 ('ñare', 3),
 ('(we', 3),
 ('for¥', 3),
 ('ñlabors', 3),
 ('god)', 3),
 ('sin)', 3),
 ('days)', 3),
 ('coming]', 3),
 ('iñ', 3),
 ('ñpresent', 3),
 ('ñdohn', 3),
 ('ñwhether', 3),
 ('ñdia-', 3),
 ('[paul]', 3),
 ('law]', 3),
 ('ñcomments', 3),
 ('ñsome', 3),
 ('*ill', 3),
 ('ñchris-', 3)]

Correction 1 -- Normalize Characters

In [13]:
# %load shared_elements/normalize_characters.py
prev = "baseline"
cycle = "correction1"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    # Substitute for all other dashes
    content = re.sub(r"—-—–‑", r"-", content)

    # Substitute formatted apostrophe
    content = re.sub(r"\’\’\‘\'\‛\´", r"'", content)
    
    # Replace all special characters with a space (as these tend to occur at the end of lines)
    content = re.sub(r"[^a-zA-Z0-9\s,.!?$:;\-&\'\"]", r" ", content)
    
    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
In [14]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/GS/correction1

Average verified rate: 0.9442336721500114

Average of error rates: 0.056576785714285704

Total token count: 1293128

In [15]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[15]:
[('-', 3001),
 ("'", 2553),
 ('re-', 817),
 ('tion', 751),
 ('d', 721),
 ('con-', 691),
 ('t', 690),
 ('e', 516),
 ('w', 515),
 ('in-', 506),
 ('be-', 498),
 ('com-', 462),
 ('r', 455),
 ('de-', 407),
 ('ment', 379),
 ('f', 377),
 ('m', 372),
 ('g', 324),
 ('n', 304),
 ('ex-', 268),
 ('sab-', 253),
 ('th', 241),
 ("of'", 231),
 ('un-', 225),
 ("'the", 204),
 ("the'", 202),
 ('u', 196),
 ('tions', 193),
 ('ex', 187),
 ('im-', 186),
 ('dis-', 182),
 ('pro-', 179),
 ('en-', 171),
 ('per-', 171),
 ('pre-', 158),
 ('ments', 157),
 ('x', 154),
 ('--', 153),
 ('chris-', 144),
 ('ob-', 136),
 ('an-', 136),
 ('command-', 132),
 ('ble', 131),
 ('ac-', 131),
 ('ap-', 129),
 ("'of", 127),
 ('pp', 119),
 ('ad-', 112),
 ('aro', 110),
 ('semi-monthly', 109)]

Correction 2 -- Reconnect Split Line Endings

In [16]:
# %load shared_elements/correct_line_endings.py
prev = "correction1"
cycle = "correction2"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    content = re.sub(r"(\w+)(\-\s{1,})([a-z]+)", r"\1\3", content)

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
In [17]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/GS/correction2

Average verified rate: 0.966747340132954

Average of error rates: 0.034048214285714286

Total token count: 1272620

In [18]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[18]:
[('-', 2992),
 ("'", 2553),
 ('d', 721),
 ('t', 682),
 ('w', 515),
 ('e', 511),
 ('r', 452),
 ('f', 376),
 ('m', 372),
 ('g', 322),
 ('n', 304),
 ('th', 239),
 ("of'", 231),
 ("'the", 204),
 ("the'", 202),
 ('u', 195),
 ('ex', 187),
 ('x', 154),
 ('--', 153),
 ("'of", 127),
 ('pp', 119),
 ('aro', 110),
 ('semi-monthly', 109),
 ('first-day', 108),
 ('-the', 108),
 ("'and", 79),
 ('eze', 75),
 ("and'", 69),
 ("'to", 65),
 ('k', 64),
 ('re', 62),
 ('bo', 59),
 ("to'", 58),
 ('wo', 57),
 ("''", 57),
 ('-no', 56),
 ('tent-meetings', 52),
 ('q', 50),
 ('ti', 49),
 ('sunday-keeping', 48),
 ('tion', 48),
 ('ots', 48),
 ('mo', 45),
 ("in'", 45),
 ('ft', 43),
 ('---', 41),
 ('co', 39),
 ('wm', 39),
 ('rest-day', 38),
 ("'a", 38)]

Correction 3 -- Remove extra dashes

In [19]:
# %load shared_elements/remove_extra_dashes.py
prev = "correction2"
cycle = "correction3"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)
    
    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    
    replacements = []
    for token in tokens:
        if token[0] is "-":
            replacements.append((token, token[1:]))
            
        elif token[-1] is "-":
            replacements.append((token, token[:-1]))
        else:
            pass
        
    if len(replacements) > 0:
#         print("{}: {}".format(filename, replacements))
        
        for replacement in replacements:
            content = clean.replace_pair(replacement, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
In [20]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/GS/correction3

Average verified rate: 0.9731340170454946

Average of error rates: 0.027557142857142856

Total token count: 1275293

In [21]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[21]:
[("'", 2637),
 ('d', 728),
 ('t', 698),
 ('e', 525),
 ('w', 519),
 ('r', 469),
 ('f', 384),
 ('m', 376),
 ('g', 331),
 ('n', 311),
 ('th', 244),
 ("of'", 231),
 ("'the", 206),
 ("the'", 202),
 ('u', 197),
 ('ex', 196),
 ('x', 160),
 ('re', 158),
 ("'of", 128),
 ('pp', 119),
 ('aro', 110),
 ('co', 91),
 ("'and", 79),
 ('eze', 75),
 ("and'", 71),
 ('ti', 67),
 ('k', 66),
 ("'to", 65),
 ('bo', 61),
 ("''", 59),
 ("to'", 58),
 ('wo', 57),
 ('q', 53),
 ('tion', 48),
 ('ots', 48),
 ("in'", 46),
 ('mo', 45),
 ('ft', 44),
 ('pre', 41),
 ('wm', 39),
 ("'in", 38),
 ("'a", 38),
 ("'is", 37),
 ('mt', 37),
 ("that'", 36),
 ("'that", 33),
 ('ment', 32),
 ("saints'", 31),
 ('es', 31),
 ("'t", 30)]

Correction 4 -- Remove Extra Quotation Marks

In [22]:
# %load shared_elements/replace_extra_quotation_marks.py
prev = "correction3"
cycle = "correction4"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)
    
    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    
    corrections = []
    for token in tokens:
        token_list = list(token)
        last_char = token_list[-1]

        if last_char is "'":
            if len(token) > 1:
                if token_list[-2] is 's' or 'S':
                    pass
                else:
                    corrections.append((token, re.sub(r"'", r"", token)))
            else:
                pass
        elif token[0] is "'":
            corrections.append((token, re.sub(r"'", r"", token)))   
        else:
            pass
    
    if len(corrections) > 0:
#         print('{}: {}'.format(filename, corrections))

        for correction in corrections:
            content = clean.replace_pair(correction, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
In [23]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/GS/correction4

Average verified rate: 0.9758673740635434

Average of error rates: 0.024801785714285717

Total token count: 1275286

In [24]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[24]:
[("'", 2240),
 ('t', 768),
 ('d', 734),
 ('e', 548),
 ('w', 522),
 ('r', 489),
 ('f', 403),
 ('m', 387),
 ('n', 337),
 ('g', 333),
 ('th', 247),
 ("of'", 218),
 ('u', 198),
 ('ex', 197),
 ("the'", 178),
 ('re', 166),
 ('x', 160),
 ('pp', 119),
 ('aro', 111),
 ('co', 92),
 ('eze', 75),
 ('ti', 68),
 ("and'", 66),
 ('k', 66),
 ('bo', 62),
 ('wo', 59),
 ('q', 53),
 ('tion', 50),
 ("to'", 49),
 ('ots', 48),
 ('mo', 46),
 ('ft', 44),
 ('pre', 42),
 ("''", 41),
 ("in'", 40),
 ('wm', 40),
 ('mt', 37),
 ('nd', 36),
 ('al', 32),
 ('se', 32),
 ('ment', 32),
 ('es', 31),
 ('mal', 30),
 ('pa', 30),
 ("saints'", 30),
 ("that'", 29),
 ('goapxl', 27),
 ('ay', 27),
 ('iu', 27),
 ('il', 26)]

Check Correction 5 -- Rejoin Burst Words

In [25]:
# %load shared_elements/rejoin_burst_words.py
prev = "correction4"
cycle = "correction5"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    pattern = re.compile("(\s(\w{1,2}\s){5,})")
    
    replacements = []
    clean.check_splits(pattern, spelling_dictionary, content, replacements)
    
    if len(replacements) > 0:
        print('{}: {}'.format(filename, replacements))

        for replacement in replacements:
            content = clean.replace_pair(replacement, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
GS18860201-V01-01-page1.txt: [('By', 'By')]
GS18860415-V01-06-page2.txt: [(' h o r n s ', 'horns')]
GS18860815-V01-14-page7.txt: [('\nF R A N C E ', 'FRANCE')]
GS18861015-V01-18-page6.txt: [('It', 'It')]
GS18861101-V01-19-page4.txt: [('Is', 'Is')]
GS18861208-V01-22-page8.txt: [(' b el o v ed ', 'beloved')]
GS18861222-V01-24-page8.txt: [('Go', 'Go')]
GS18870315-V02-06-page3.txt: [('If', 'If')]
GS18880101-V03-01-page7.txt: [('It', 'It')]
GS18880215-V03-04-page3.txt: [('We', 'We')]
GS18881001-V03-19-page2.txt: [('It', 'It'), ('If', 'If')]
GS18881101-V03-21-page1.txt: [('\nE D IT O R IA L ', 'EDITORIAL'), (' C O M M IT T E E ', 'COMMITTEE')]
GS18881215-V03-24-page5.txt: [(' d u r i n g ', 'during')]
GS18881215-V03-24-page6.txt: [('It', 'It')]
In [26]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/GS/correction5

Average verified rate: 0.9758713136729222

Average of error rates: 0.024798214285714288

Total token count: 1275287

In [27]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[27]:
[("'", 2240),
 ('t', 767),
 ('d', 745),
 ('e', 544),
 ('w', 522),
 ('r', 485),
 ('f', 402),
 ('m', 385),
 ('n', 334),
 ('g', 332),
 ('th', 247),
 ("of'", 218),
 ('ex', 197),
 ('u', 197),
 ("the'", 178),
 ('re', 166),
 ('x', 160),
 ('pp', 119),
 ('aro', 111),
 ('co', 92),
 ('eze', 75),
 ('ti', 68),
 ("and'", 66),
 ('k', 66),
 ('bo', 62),
 ('wo', 59),
 ('q', 53),
 ('tion', 50),
 ("to'", 49),
 ('ots', 48),
 ('mo', 46),
 ('ft', 44),
 ('pre', 42),
 ("''", 41),
 ("in'", 40),
 ('wm', 40),
 ('mt', 37),
 ('nd', 36),
 ('al', 32),
 ('se', 32),
 ('ment', 32),
 ('es', 31),
 ('mal', 30),
 ('pa', 30),
 ("saints'", 30),
 ("that'", 29),
 ('goapxl', 27),
 ('ay', 27),
 ('iu', 27),
 ('il', 26)]

Correction 6 -- Rejoin Split Words

In [28]:
# %load shared_elements/rejoin_split_words.py
prev = "correction5"
cycle = "correction6"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)
    
    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    errors = reports.identify_errors(tokens, spelling_dictionary)

    replacements = clean.check_if_stem(errors, spelling_dictionary, tokens, get_prior=False)
    
    if len(replacements) > 0:
        print('{}: {}'.format(filename, replacements))

        for replacement in replacements:
            content = clean.replace_split_words(replacement, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
GS18860201-V01-01-page2.txt: [('es', 'pecially')]
GS18860201-V01-01-page8.txt: [('co', 'operation'), ('re', 'examined')]
GS18860215-V01-02-page1.txt: [('Bi', 'ble'), ('mo', 'e')]
GS18860215-V01-02-page3.txt: [('M.', '')]
GS18860215-V01-02-page4.txt: [('re', 'examination')]
GS18860215-V01-02-page6.txt: [('co', 'operate'), ('wi', 'th'), ('gos', 'pel')]
GS18860215-V01-02-page7.txt: [('Mor', 'ton')]
GS18860215-V01-02-page8.txt: [('bi', 'monthly'), ('co', 'worker'), ('re', 'echoed')]
GS18860301-V01-03-page1.txt: [('hu', 'manity'), ('Conti', 'nent')]
GS18860301-V01-03-page3.txt: [('sugges', 'tion'), ('experi', 'ment')]
GS18860301-V01-03-page5.txt: [('wor', 'shiped'), ('mUr', 'der')]
GS18860301-V01-03-page6.txt: [('wa', 's')]
GS18860301-V01-03-page7.txt: [('TH', 'E')]
GS18860301-V01-03-page8.txt: [('Nebu', "chadnezzar's")]
GS18860315-V01-04-page1.txt: [('Re', 'in')]
GS18860315-V01-04-page2.txt: [('ne', 'man'), ('th', 'e')]
GS18860315-V01-04-page4.txt: [('dei', 'L'), ('ac', 'count'), ('se', 'a')]
GS18860315-V01-04-page5.txt: [('re', 'lies'), ('es', 'tablish'), ('th', 'a')]
GS18860315-V01-04-page7.txt: [('un', 'der'), ('suc', 'cessfully')]
GS18860315-V01-04-page8.txt: [('estab', 'lishments'), ('pedo', 'Baptists'), ('M.', ''), ('stimu', 'lated')]
GS18860401-V01-05-page1.txt: [('PA', 'm'), ('AC', 'CORDING'), ('thes', 'es')]
GS18860401-V01-05-page4.txt: [('har', 'mony'), ('G.', '')]
GS18860401-V01-05-page6.txt: [('ch', 'on'), ('indi', 'vidual'), ('inhab', 'itants')]
GS18860401-V01-05-page7.txt: [('hea', 'ven'), ('Scole', 's'), ('re', 'Ported'), ('sa', 'y')]
GS18860401-V01-05-page8.txt: [('posi', 'tions')]
GS18860415-V01-06-page1.txt: [('op', 'ed')]
GS18860415-V01-06-page2.txt: [('fi', 'B'), ('na', 'N')]
GS18860415-V01-06-page4.txt: [('pre', 'eminence')]
GS18860415-V01-06-page7.txt: [('re', 'ported')]
GS18860501-V01-07-page1.txt: [('ev', 'ery'), ('YA', 'N'), ('ex', 'tent')]
GS18860501-V01-07-page2.txt: [('sk', 'i'), ('re', 'ached')]
GS18860501-V01-07-page3.txt: [('eter', 'nity'), ('forw', 'ard')]
GS18860501-V01-07-page4.txt: [('M.', '')]
GS18860501-V01-07-page5.txt: [('ern', 'a')]
GS18860501-V01-07-page6.txt: [('yo', 'u')]
GS18860515-V01-08-page1.txt: [('re', 'ascend')]
GS18860515-V01-08-page3.txt: [('touchin', 'g')]
GS18860515-V01-08-page5.txt: [('resurrec', 'tion')]
GS18860515-V01-08-page6.txt: [('TES', 'TAMENT'), ('anx', 'iously')]
GS18860515-V01-08-page8.txt: [('Fr', 'It')]
GS18860601-V01-09-page1.txt: [('cour', 'age')]
GS18860601-V01-09-page2.txt: [('Se', 'A'), ('Mo', 'ses')]
GS18860601-V01-09-page3.txt: [('experi', 'ence')]
GS18860601-V01-09-page5.txt: [('sev', 'enth'), ('insti', 'tutiOn')]
GS18860601-V01-09-page7.txt: [('desti', 'tute')]
GS18860615-V01-10-page2.txt: [('ut', 'e'), ('nei', 'ther'), ('ful', 'fill')]
GS18860615-V01-10-page3.txt: [('re', 'constituted')]
GS18860615-V01-10-page5.txt: [('nowa', 'days'), ('M.', '')]
GS18860615-V01-10-page8.txt: [('coun', 'try')]
GS18860701-V01-11-page2.txt: [('astro', 'nomical')]
GS18860701-V01-11-page4.txt: [('oll', 'a'), ('WeSle', 'yan'), ('hei', 'nous')]
GS18860701-V01-11-page6.txt: [('RE', 'MEMBER'), ('re', 'inforced')]
GS18860701-V01-11-page7.txt: [('Deca', 'tur'), ('co', 'operation')]
GS18860701-V01-11-page8.txt: [('co', 'laborers'), ('applica', 'ble')]
GS18860715-V01-12-page1.txt: [('pre', 'existence'), ('COMMI', 'T')]
GS18860715-V01-12-page7.txt: [('inten', 'sity')]
GS18860715-V01-12-page8.txt: [('em', 'It'), ('BI', 'BLE')]
GS18860801-V01-13-page6.txt: [('calam', 'ities')]
GS18860815-V01-14-page4.txt: [('re', 'enacted')]
GS18860815-V01-14-page5.txt: [('fr', 'om')]
GS18860901-V01-15-page3.txt: [('resurrec', 'tion')]
GS18860901-V01-15-page6.txt: [('Mc', 'KAY')]
GS18860901-V01-15-page7.txt: [('Ti', 'es')]
GS18860915-V01-16-page1.txt: [('Ev', 'idently')]
GS18860915-V01-16-page2.txt: [('pri', 'est'), ('Sanctu', 'ary')]
GS18860915-V01-16-page3.txt: [('resurrec', 'tion'), ('Jeru', 'Salem')]
GS18860915-V01-16-page6.txt: [('CHRI', 'STIAN')]
GS18861001-V01-17-page1.txt: [('immor', 'tality'), ('CO', 'M')]
GS18861001-V01-17-page2.txt: [('ro', 'x')]
GS18861001-V01-17-page3.txt: [('co', 'laborer'), ('re', 'affirmed')]
GS18861001-V01-17-page4.txt: [('re', 'jects')]
GS18861001-V01-17-page6.txt: [('pre', 'existent')]
GS18861001-V01-17-page8.txt: [('va', 'cation'), ('pre', 'eminently')]
GS18861015-V01-18-page1.txt: [('re', 'made')]
GS18861015-V01-18-page3.txt: [('re', 'surrection')]
GS18861015-V01-18-page5.txt: [('bo', 'real'), ('re', 'constructed')]
GS18861015-V01-18-page7.txt: [('wi', 'de'), ('Ti', 's')]
GS18861015-V01-18-page8.txt: [('re', 'enacted')]
GS18861101-V01-19-page1.txt: [('pre', 'existing'), ('kR', 'IS')]
GS18861101-V01-19-page2.txt: [('re', 'establishment')]
GS18861101-V01-19-page5.txt: [('G.', '')]
GS18861101-V01-19-page6.txt: [('deca', 'Logue')]
GS18861115-V01-20-page2.txt: [('DOCTRI', 'NAL')]
GS18861115-V01-20-page4.txt: [('re', 'ligion')]
GS18861115-V01-20-page6.txt: [('Mi', 'r')]
GS18861115-V01-20-page8.txt: [('Wi', 'thout')]
GS18861208-V01-22-page4.txt: [('Ka', 'ma')]
GS18861208-V01-22-page8.txt: [('giv', 'e'), ('ol', 'es'), ('WI', 'en'), ('re', 'garded')]
GS18861215-V01-23-page1.txt: [('leav', 'ing')]
GS18861215-V01-23-page3.txt: [('cer', 'tainly')]
GS18861215-V01-23-page7.txt: [('co', 'operate'), ('Mc', 'Donald'), ('Th', 'ree'), ('re', 'establishment'), ('dili', 'gent'), ('prin', 'ciple')]
GS18861222-V01-24-page5.txt: [('se', 'mi')]
GS18870101-V02-01-page2.txt: [('OB', 'SERVED')]
GS18870101-V02-01-page4.txt: [('foreknowl', 'edge'), ('inevit', 'able')]
GS18870101-V02-01-page5.txt: [('re', 'quires')]
GS18870101-V02-01-page8.txt: [('PAPE', 'R')]
GS18870115-V02-02-page1.txt: [('pre', 'eminence'), ('re', 'enact')]
GS18870115-V02-02-page5.txt: [('co', 'operate'), ('wor', 'shipers'), ('attendan', 't')]
GS18870115-V02-02-page6.txt: [('co', 'operation'), ('re', 'modeled')]
GS18870201-V02-03-page1.txt: [('mo', 'I'), ('th', 'a')]
GS18870201-V02-03-page5.txt: [('co', 'extensive')]
GS18870201-V02-03-page6.txt: [('EXPLANA', 'TION')]
GS18870201-V02-03-page7.txt: [('co', 'operation')]
GS18870215-V02-04-page1.txt: [('beh', 'old')]
GS18870215-V02-04-page6.txt: [('co', 'operation'), ('mal', 'administration'), ('Capel', 'in')]
GS18870215-V02-04-page7.txt: [('prac', 'tice')]
GS18870301-V02-05-page3.txt: [('co', 'laborers'), ('bo', 'no')]
GS18870301-V02-05-page5.txt: [('co', 'operation')]
GS18870301-V02-05-page6.txt: [('co', 'operation')]
GS18870301-V02-05-page7.txt: [('ple', 'a')]
GS18870315-V02-06-page1.txt: [('evangeli', 'cal')]
GS18870315-V02-06-page3.txt: [('sacri', 'fice')]
GS18870315-V02-06-page4.txt: [('re', 't')]
GS18870315-V02-06-page5.txt: [('pre', 'eminent')]
GS18870315-V02-06-page7.txt: [('ch', 'at')]
GS18870315-V02-06-page8.txt: [('applIca', 'ble'), ('ex', 'pressed'), ('th', 'e')]
GS18870401-V02-07-page1.txt: [('ti', 'e'), ('OD', 'in'), ('th', 'e')]
GS18870401-V02-07-page2.txt: [('conse', 'quent'), ('ag', 'riculture'), ('re', 'creating')]
GS18870401-V02-07-page5.txt: [('co', 'workers')]
GS18870401-V02-07-page7.txt: [('preachin', 'g'), ('re', 'admission')]
GS18870415-V02-08-page1.txt: [('eter', 'nal'), ('PRE', 't')]
GS18870415-V02-08-page2.txt: [('ut', 'a')]
GS18870415-V02-08-page3.txt: [('re', 'echo')]
GS18870415-V02-08-page5.txt: [('re', 'penting')]
GS18870415-V02-08-page6.txt: [('co', 'operate'), ('destr', 'oyed')]
GS18870415-V02-08-page7.txt: [('Mc', 'Clure')]
GS18870415-V02-08-page8.txt: [('institu', 'tion'), ('impressiv', 'e')]
GS18870501-V02-09-page1.txt: [('re', 'T')]
GS18870501-V02-09-page5.txt: [('re', 'enforced')]
GS18870501-V02-09-page6.txt: [('co', 'operation'), ('re', 'enforcement')]
GS18870501-V02-09-page7.txt: [('Mc', 'Cutchen'), ('ac', 'Cessions')]
GS18870501-V02-09-page8.txt: [('M.', ''), ('Mose', 's'), ('Thermo', 'Electric')]
GS18870515-V02-10-page1.txt: [('ex', 'pressed')]
GS18870515-V02-10-page4.txt: [('pre', 'millennial'), ('har', 'Mony')]
GS18870515-V02-10-page6.txt: [('ro', 'n')]
GS18870515-V02-10-page7.txt: [('co', 'religionists')]
GS18870515-V02-10-page8.txt: [('Bi', 'ble'), ('necessi', 'tates')]
GS18870601-V02-11-page7.txt: [('un', 'dersell')]
GS18870615-V02-12-page6.txt: [('hea', 'Ven'), ('neith', 'er'), ('li', 'D')]
GS18870615-V02-12-page8.txt: [('giv', 'ing')]
GS18870701-V02-13-page4.txt: [('co', 'eval'), ('unscript', 'ural')]
GS18870701-V02-13-page5.txt: [('re', 'echo')]
GS18870701-V02-13-page6.txt: [('Mc', 'Mullen'), ('Cy', 'clopedia')]
GS18870701-V02-13-page7.txt: [('senti', 'ment')]
GS18870701-V02-13-page8.txt: [('applica', 'ble')]
GS18870715-V02-14-page3.txt: [('pil', 'grim')]
GS18870715-V02-14-page6.txt: [('sh', 'all')]
GS18870715-V02-14-page7.txt: [('Mc', 'Clure'), ('re', 'Coveries')]
GS18870715-V02-14-page8.txt: [('libe', 'rty'), ('ac', 'e'), ('co', 'nscience'), ('imProp', 'erly'), ('Thermo', 'Electric')]
GS18870801-V02-15-page1.txt: [('frie', 's'), ('cov', 'er'), ('re', 'born')]
GS18870801-V02-15-page2.txt: [('ny', 'man'), ('rul', 'e')]
GS18870801-V02-15-page4.txt: [('re', 'ward')]
GS18870801-V02-15-page5.txt: [('provisio', 'n'), ('fo', 'r'), ('th', 'e')]
GS18870801-V02-15-page6.txt: [('antedilu', 'Vians'), ('re', 'organize')]
GS18870801-V02-15-page7.txt: [('co', 'operate'), ('re', 'mainder')]
GS18870801-V02-15-page8.txt: [('receivi', 'ng'), ('ta', 'I'), ('grea', 't'), ('Thermo', 'Electric')]
GS18870815-V02-16-page1.txt: [('pre', 'eminent')]
GS18870815-V02-16-page3.txt: [('ex', 'ceeding')]
GS18870815-V02-16-page4.txt: [('pre', 'eminent'), ('prin', 'ciples')]
GS18870815-V02-16-page7.txt: [('Te', 't'), ('co', 'operation'), ('th', 'e')]
GS18870901-V02-17-page1.txt: [('dif', 'ference')]
GS18870901-V02-17-page2.txt: [('toget', 'her')]
GS18870901-V02-17-page3.txt: [('re', 'creation')]
GS18870901-V02-17-page4.txt: [('re', 'veal')]
GS18870901-V02-17-page6.txt: [('M.', ''), ('re', 'enactment')]
GS18870901-V02-17-page7.txt: [('aggressiv', 'e')]
GS18870901-V02-17-page8.txt: [('pre', 'conceived')]
GS18870915-V02-18-page3.txt: [('co', 'workers')]
GS18870915-V02-18-page6.txt: [('pre', 'existence')]
GS18870915-V02-18-page8.txt: [('al', 'va'), ('Thermo', 'Electric')]
GS18871001-V02-19-page1.txt: [('ne', 'ed'), ('bo', 'As')]
GS18871015-V02-20-page2.txt: [('na', 'tions'), ('direc', 'tion')]
GS18871101-V02-21-page5.txt: [('pre', 'eminently'), ('co', 'operation')]
GS18871101-V02-21-page7.txt: [('co', 'operation'), ('ti', 'ng')]
GS18871101-V02-21-page8.txt: [('co', 'operation'), ('Thermo', 'Electric')]
GS18871115-V02-22-page1.txt: [('AMA', 'DON')]
GS18871115-V02-22-page2.txt: [('rA', 't'), ('al', 'I')]
GS18871115-V02-22-page3.txt: [('gra', 'il'), ('mo', 'ney')]
GS18871115-V02-22-page4.txt: [('ple', 'as')]
GS18871115-V02-22-page5.txt: [('CO', 'M')]
GS18871115-V02-22-page6.txt: [('re', 'read')]
GS18871115-V02-22-page7.txt: [('Re', 'form'), ('PROHIBI', 'TION')]
GS18871201-V02-23-page1.txt: [('Om', 'a')]
GS18871201-V02-23-page3.txt: [('re', 'appears')]
GS18871201-V02-23-page4.txt: [('applica', 'tions')]
GS18871201-V02-23-page8.txt: [('si', 'A'), ('Es', 'pecially')]
GS18871215-V02-24-page1.txt: [('hu', 'man'), ('bo', 'a')]
GS18871215-V02-24-page3.txt: [('Gos', 'PEL')]
GS18871215-V02-24-page4.txt: [('co', 'operated')]
GS18871215-V02-24-page6.txt: [('co', 'operation')]
GS18871215-V02-24-page7.txt: [('si', 'x'), ('Mc', 'Coy')]
GS18871215-V02-24-page8.txt: [('co', 'operation'), ('re', 'read'), ('th', 'e')]
GS18880101-V03-01-page1.txt: [('WHER', 'EFORE')]
GS18880101-V03-01-page3.txt: [('TRU', 'E')]
GS18880101-V03-01-page4.txt: [('ti', 'e'), ('re', 'animated')]
GS18880101-V03-01-page5.txt: [('pre', 'eminent')]
GS18880101-V03-01-page7.txt: [('un', 'stimulated')]
GS18880115-V03-02-page8.txt: [('re', 'reading')]
GS18880201-V03-03-page8.txt: [('co', 'operation')]
GS18880215-V03-04-page2.txt: [('compara', 'tively')]
GS18880215-V03-04-page4.txt: [('pre', 'eminently')]
GS18880215-V03-04-page7.txt: [('PENNSYL', 'VANIA')]
GS18880215-V03-04-page8.txt: [('excom', 'munication'), ('Thermo', 'Electric')]
GS18880301-V03-05-page6.txt: [('proph', 'ecies'), ('ap', 'petites')]
GS18880301-V03-05-page7.txt: [('re', 'ceived')]
GS18880315-V03-06-page1.txt: [('Re', 'former')]
GS18880315-V03-06-page8.txt: [('Thermo', 'Electric'), ('re', 'published')]
GS18880401-V03-07-page7.txt: [('Mc', 'Clelland'), ('pre', 'existence')]
GS18880401-V03-07-page8.txt: [('re', 'iterated')]
GS18880415-V03-08-page5.txt: [('re', 'newed')]
GS18880415-V03-08-page8.txt: [('co', 'laborers'), ('Thermo', 'Electric')]
GS18880501-V03-09-page1.txt: [('re', 'christened')]
GS18880501-V03-09-page4.txt: [('Ch', 'ristian'), ('re', 'adjust')]
GS18880501-V03-09-page7.txt: [('Mc', "Bride's")]
GS18880515-V03-10-page1.txt: [('CO', 'MMITTEE')]
GS18880515-V03-10-page3.txt: [('ti', 'e')]
GS18880515-V03-10-page7.txt: [('od', 'is')]
GS18880615-V03-12-page2.txt: [('PRE', 'EXISTENCE')]
GS18880615-V03-12-page3.txt: [('pre', 'existed'), ('ABRA', 'HAM')]
GS18880615-V03-12-page4.txt: [('previ', 'ous')]
GS18880615-V03-12-page6.txt: [('Fa', 'thers')]
GS18880615-V03-12-page7.txt: [('recom', 'mendation')]
GS18880701-V03-13-page2.txt: [('ex', 'changed')]
GS18880701-V03-13-page4.txt: [('hu', 'Man')]
GS18880701-V03-13-page6.txt: [('Se', 't')]
GS18880701-V03-13-page7.txt: [('co', 'operation')]
GS18880701-V03-13-page8.txt: [('pre', 'eminence'), ('re', 'enacted')]
GS18880715-V03-14-page4.txt: [('co', 'operation')]
GS18880715-V03-14-page8.txt: [('th', 'u')]
GS18880801-V03-15-page3.txt: [('co', 'extensive')]
GS18880801-V03-15-page4.txt: [('ob', 'tain')]
GS18880801-V03-15-page6.txt: [('dal', 'es')]
GS18880801-V03-15-page7.txt: [('Mc', 'Coy')]
GS18880801-V03-15-page8.txt: [('py', 'a')]
GS18880815-V03-16-page6.txt: [('indi', 'vidual')]
GS18880815-V03-16-page8.txt: [('fl', 'it'), ('ap', 'pointments'), ('su', 'Persede')]
GS18880901-V03-17-page1.txt: [('se', 'A')]
GS18880901-V03-17-page3.txt: [('Re', 'heard'), ('HOMESP', 'UN')]
GS18880901-V03-17-page4.txt: [('Tuscu', 'lum')]
GS18880901-V03-17-page5.txt: [('compan', 'ions')]
GS18880901-V03-17-page6.txt: [('TEM', 'PERANCE')]
GS18880901-V03-17-page7.txt: [('re', 'established')]
GS18880901-V03-17-page8.txt: [('Thermo', 'Electric'), ('ex', 'pressed')]
GS18880915-V03-18-page4.txt: [('co', 'operation')]
GS18880915-V03-18-page7.txt: [('mythol', 'ogy')]
GS18880915-V03-18-page8.txt: [('BO', 'ors'), ('Es', 'tablished')]
GS18881001-V03-19-page1.txt: [('MONTHL', 'Y')]
GS18881001-V03-19-page4.txt: [('re', 'Frain')]
GS18881001-V03-19-page8.txt: [('Thermo', 'Electric'), ('ex', 'plicit')]
GS18881015-V03-20-page1.txt: [('fi', 'N'), ('co', 'laborers')]
GS18881015-V03-20-page4.txt: [('preten', 'd')]
GS18881015-V03-20-page5.txt: [('co', 'laborers'), ('condi', 'tion'), ('al', 'way')]
GS18881015-V03-20-page6.txt: [('obedi', 'ent')]
GS18881015-V03-20-page7.txt: [('Mc', 'Allister'), ('funda', 'mental')]
GS18881015-V03-20-page8.txt: [('Un', 'til')]
GS18881101-V03-21-page1.txt: [('soo', 'the')]
GS18881101-V03-21-page3.txt: [('re', 'instate')]
GS18881101-V03-21-page4.txt: [('hearin', 'g'), ('th', 'e')]
GS18881101-V03-21-page5.txt: [('ow', 'n')]
GS18881101-V03-21-page6.txt: [('ti', 'e'), ('re', 'read')]
GS18881101-V03-21-page7.txt: [('pre', 'existence'), ('clo', 'the')]
GS18881101-V03-21-page8.txt: [('re', 'ad')]
GS18881115-V03-22-page1.txt: [('CO', 'M')]
GS18881115-V03-22-page2.txt: [('pre', 'existence')]
GS18881115-V03-22-page5.txt: [('ac', 'cording')]
GS18881115-V03-22-page7.txt: [('sevent', 'h'), ('re', 'enactment')]
GS18881115-V03-22-page8.txt: [('co', 'operate'), ('HERAL', 'D'), ('Thermo', 'Electric')]
GS18881201-V03-23-page1.txt: [('Ca', 'n')]
GS18881201-V03-23-page2.txt: [('pre', 'eminently')]
GS18881201-V03-23-page3.txt: [('ow', 'e'), ('lil', 't')]
GS18881201-V03-23-page6.txt: [('un', 'changeable'), ('WIl', 'Y'), ('re', 'spect')]
GS18881201-V03-23-page7.txt: [('Co', 'operate'), ('co', 'operation'), ('li', 'm')]
GS18881201-V03-23-page8.txt: [('te', 'at'), ('BO', 'ors'), ('Merl', 'In')]
GS18881215-V03-24-page1.txt: [('lif', 'E'), ('re', 'establishing')]
GS18881215-V03-24-page4.txt: [('re', 'ligion')]
GS18881215-V03-24-page5.txt: [('re', 'instatement')]
GS18881215-V03-24-page8.txt: [('Si', 'S'), ('re', 'sent')]
In [29]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/GS/correction6

Average verified rate: 0.9761960041163625

Average of error rates: 0.024453571428571425

Total token count: 1274912

In [30]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[30]:
[("'", 2240),
 ('t', 757),
 ('d', 744),
 ('e', 531),
 ('w', 522),
 ('r', 483),
 ('f', 402),
 ('m', 383),
 ('g', 331),
 ('n', 328),
 ('th', 241),
 ("of'", 218),
 ('u', 197),
 ('ex', 193),
 ("the'", 178),
 ('x', 159),
 ('pp', 119),
 ('aro', 110),
 ('re', 105),
 ('eze', 75),
 ("and'", 66),
 ('k', 66),
 ('ti', 61),
 ('wo', 59),
 ('bo', 57),
 ('q', 53),
 ("to'", 49),
 ('ots', 48),
 ('tion', 44),
 ('ft', 44),
 ('mo', 41),
 ("''", 41),
 ('co', 40),
 ("in'", 40),
 ('wm', 40),
 ('mt', 37),
 ('nd', 36),
 ('ment', 32),
 ("saints'", 30),
 ('mal', 29),
 ("that'", 29),
 ('pa', 29),
 ('al', 29),
 ('goapxl', 27),
 ('ay', 27),
 ('iu', 27),
 ('se', 27),
 ('il', 26),
 ('es', 25),
 ('gospxi', 24)]

Review Remaining Errors

In [31]:
# %load shared_elements/high_error_rates.py
reports.docs_with_high_error_rate(summary)

doc_keys = [x[0] for x in reports.docs_with_high_error_rate(summary) if x[1] > 0.2]

# utilities.open_original_docs(doc_keys, directories['cycle'])
Opened files: 

GS18870801-V02-15-page8.txt

In [32]:
reports.long_errors(errors_summary, min_length=15)[:50]
Out[32]:
(["steamship'nevada",
  'governtheritlialunderstood',
  'hferoimnyasbaill',
  'germanstatiatichni',
  'tehcommandthents',
  'tioanldutahteuirecaseisfarvererycogrodoed',
  'wasthefirstwritten',
  'bethoughtthemselveswhatagrandthingitwouldbe',
  'biblesaysdistinctly',
  'bblelesssininggss',
  'llllllllllllllllllllllllllllllll',
  'consciencestricken',
  'snpererdgationompe',
  'suintnattetarguments',
  'unitedtestimonies',
  'whisicyclisiilling',
  'thatwasanageofrites',
  'thetransgresaion',
  'inisgsrighteousness',
  'ttjdtzeotgrttrtttein',
  'uponlookingoverthe',
  'irelesedcogrdinccosiwihichofaaerisi',
  'buttheapostlepaul',
  "this'ilay'on'which",
  "lidfilled'except",
  'cerietitutimielity',
  'theliiiiiittyilontlif',
  'filltifineettpled',
  "michigan'conference",
  'cirotgati-tmlitioliptition',
  'thwartedprianttice',
  'repeatedwarnings',
  "twile'lesetiitdr",
  're-establishment',
  '-absoluteprohibition',
  "neviteri''ociiiiit",
  "ionalmissionary's",
  'thelordwishedtoassurehishearersthe',
  'christurniitionedio',
  'rfothwerminisenlist',
  'poraljudgmentswerethusbroughtuponthem',
  "decelviir'swerdsi",
  'tritining-sahool',
  'aivayintfreverlasting',
  'theinterrogatory',
  'encouragingreports',
  "biit'leveryb'odyltiows",
  "christianity'did",
  'trseaslsalryythatalhienm',
  'dayofrestandholiness',
  "organizations'on",
  "conqii'efiikiatill",
  'gernmerthemselves',
  'nuatahneictuesnmitang',
  "ourselves'plaeed",
  "ministers'advocate",
  'utterlyastonishing',
  'nomommeoammammolio',
  'editorialcommittee',
  'thepromisesofgodwillallberedeemed',
  'nbdolymoaraeftahra',
  'twenty-sevetfortliern',
  'sainnigrtrontrtmurlaws',
  'unaccountableness',
  "earthly'sanctnary",
  'worshipcongregational',
  "heavenly'sanctuary",
  'tthrroughendless',
  'containedprovisions',
  'post-mortem-probation',
  'immortal-soulists',
  "hinderances'caused",
  'butimagineoursurprise',
  'worldisymbelizes',
  'iiiiiiiiiiiiiiiiiiiiiiiii',
  'withthefourthcommandment',
  'iselftptifeisecond',
  'resultasincorrect',
  'mairititaaianded',
  'thelordjesuschristsiemlf',
  'fourthcommandment',
  'intocondemnation',
  'propertyaggregates',
  'shalftlyeptireli',
  'twenty-four-page',
  'itejestablishment',
  'titrocigtagainst',
  'goodpeopleshouldtryto',
  'slmultaneelusjya',
  'benoreasontodoubtthattheyknowthatmanyoftheoatholic',
  'annihilationists',
  'imemeedieadteilnyepenrosicneedaedemtothwartthteisamthe',
  'panapitietbeiring',
  'illeteoulttidoiit',
  'citthelictlitireh',
  'heiuleiasyhrjklildgeselt',
  'necessajustifying',
  'jjjjjjjememokfind',
  'onacabryefoeorrneedmof',
  "atir'rehfleriwillflhonehof",
  'casionstoinsistontheonemotive',
  'parentsandnrtaadvgaes',
  'weshallbeclaspedinanembracesuchaswehave',
  "is'particularized",
  'one-thousand-dollar',
  "announcenient'inade",
  'partychristianity',
  'rtebsouirresextoioen',
  'folding-machines',
  'slaughter-houses',
  "on'ttliceohrist's",
  'whheennhhisisyoke',
  "formerasseveration'",
  'witnessesagainst',
  'ainovustainlifialma',
  "encouraging'report",
  "worldls'eoncerned",
  'wholeranugeeooffhumandevices',
  'rationautreatiient',
  'socialiabilities',
  'mustbeitsmeasure',
  "grand'counterfeit",
  'andtothedoctrinewhichisaccordingtogodliness',
  'btilibelosedcocntps',
  'seventh-part-of-time',
  'theexaminationofthings',
  'forthechildrenofwantarein',
  'christianinstructed',
  'istoliovesnolaulson',
  'thinationandjall',
  'serreenralanfleldnesenor',
  'vongregntoziasays',
  'doetorofdivinity',
  'judglinjiftkoidrikid',
  'cartluittakfthen',
  'intermollegiatecoavention',
  'chrificdestroysall',
  'dayisimentionecl',
  'advancedforadopting',
  'whosvlawwehavetransgrossed',
  "therefere'deaerving",
  'ebrlinugrdagraeib',
  'stateratherthanoft',
  'mannefiopharacters',
  'anaerialparadise',
  'crushetheomplain',
  'sitcalisailitarium',
  'thefirstdayofapriliscelebrated',
  'boarding-schools',
  "welllt'eulgdaleitaly",
  'througlksighteen',
  'filiaitieoffikfb',
  'tecdprioesielminprortant',
  'itthusbecomesself',
  "physical'creation",
  'esidteaccompliched',
  'responaibilities',
  'cametheirinevitableportion',
  'inphilosolilikak',
  'hagibeentplistlied',
  "believers'reported",
  'pennsylvaniapference',
  'self-contradictory',
  "american'tractsocietysaysitis",
  'materializations',
  'writingintheindependentconcerningtheeromancatholic',
  "thenational'organization",
  'sanetioaedbyhthrewieritios',
  'stthjegtornewhat',
  'plaitoinishndersitanding',
  'itisthisdiscordwhichdisgustspeople',
  'fortheotherwecanbut',
  'whoshalljudgethequickandthedeadat',
  'influeocemothing',
  'countermarchings',
  'divinelfoppointed',
  'morabsensibilities',
  'tircirfalfillment',
  'unstatesraanlike',
  'religiousteachers',
  'wellauthenticated',
  'delightthemselves',
  'detpawrteinagwas',
  'stonebythefingerofgod',
  'detrhveeidlnwtoions',
  'dhassolemnlydeclaredthatthere',
  'butunconditional',
  'tieforhispiallnowdsj',
  'iteoptelioepleleav',
  'sermonfrommanuscriptitoughttobeseefetfromthecongre',
  'ofitasamanofmoderatedesiresmayknowhowto',
  'whoshallchangeourvilebyo',
  'probablytheruleofou',
  "faoarevearsalf'tearabge",
  'thedecommonschobls',
  'forinterpretatitan',
  'unalloyedsluster',
  'bariereiterpriiiitahavhiss',
  'pliteiitvigndilfvihi',
  'articlesprepared',
  'whatanoversightinthelord',
  'ickednessanddegradaare',
  "fulfilled'hefere",
  'exampleofbaseringratitudeorofgreaterinconsistency',
  'christianography',
  'sptodtepartitefooa',
  'tifteiiadlyeenahioug',
  'asdeeplyrootedand',
  "hinrian'illeeha'disease",
  'theircaondncsocmienmcaen',
  'notwithvennttchhiimomfmroming',
  'instrurnentalities',
  'advehisementstere',
  'therewasauniversalistupon',
  'the-resurrection',
  'annihilaearliest',
  "disappointmeut'which",
  'legislationunder',
  "whydidn'thesayso",
  'briadinhatvoinmarry',
  'evetieletiinrile',
  'disgracedtheitem',
  'postmaster-general',
  "n'ibuneestimates",
  'forciblyillustrated',
  'thseentsikleiallonalloithmeantreiboenslitowf',
  'andbothwerewrittenbythesameapostle',
  'self-satisfaction',
  'andifsatancastoutsatan',
  'hewillnotbesubjteocatsecond',
  'rconciletheirprofession',
  'vtothiveetabsletse',
  'thesunandtherendingofthe',
  'terriblethreatenings',
  'instrumenttoward',
  'tabootfhaefhoeurnadbaetflorne',
  'andiimportfanocnlolecewoifntheweek',
  "deatruotive'thili",
  "sevendaylteuld'he",
  'biedteovotitiedptroesiemnrotrnsent',
  'maaoansitihenretiise',
  'irhceonversation',
  "claiming'support",
  'dlecontlitstance',
  "intelligence'pum",
  "e'excommunicated",
  'easilyascertained',
  'everythingwhichgdosendsyou',
  "reigns'supremely",
  'acceptaeblobedience',
  'otlierinatitutfon',
  'innevbeornlivhat',
  'ettehotpreaolrlikopflul',
  'bescohamllertehiegnkirogredvoemrsaonfdoeuverrl',
  'understaltrypiteetipeas',
  'nuifinfiactiarers',
  'youcannotaffordtobeamanofa',
  'themanisdrawntocome',
  'heaven-appointed',
  'theophilanthropy',
  "missiqivary'society",
  'overtheifiilearimth',
  'thathemaydipthetipofhisfingerin',
  "evidentthatifone's",
  'weestablishthelaw',
  'thebranchesofthefig',
  'selfcontradiction',
  "ofthequ'lessitrtii",
  'reilnrrectiontlay',
  'itihaenndynonudfcixi',
  'efteonamunioateu',
  'disincorporation',
  'leisureafterwards',
  'fellow-reformers',
  'recotninendation',
  'ealinotlaylalievrnin',
  'gomregationatfst',
  'judgekintelvnelutibttle',
  'awfutdescription',
  "father'slcommandments",
  "hidden'retailheiv",
  'feridriazqtofsalvation',
  'tlivnerthrgeleigirs',
  "notwithstanding'",
  'followingquestion',
  'heraldpublishing',
  'especiallyjnteresting',
  'thathedidnottell',
  "sithilarly'hiessa",
  'sacrainentarianism',
  'aknetietphetsilis',
  "whatauthority'does",
  'heavoillyitfather',
  'apostlesthemselves',
  'andsubstitutessundayinitsplace',
  'circunistttnersuutkc',
  'ataliformsisismeisseetesliias',
  'thetodelerstesthceomi',
  'thisisonlyinfulfillmentofthecharacteristicsgiven',
  'consistsustaining',
  'thriunchangeable',
  'rhieesettohfetheiseoerecones',
  'sunratednotothee',
  'thatisintheveryself',
  'meminisvimminimm',
  'ginmeavteigaayst',
  'spiritual-minded',
  'willnotevenhavethecompanionshipofthosewho',
  'rerxicnecedoinfg',
  'congreyotiorialials',
  'itsiiversaiisible',
  "imploreafather'slove",
  'temporalmillennium',
  "new'lefibreh'jtikeotifiilete",
  'supremelovetogodandequallovetoourfellowthemythicgorgeousnessofgenuineoldheathenism'],
 15)

Correction 7 -- Split Squashed Errors

In [33]:
# %load shared_elements/separate_squashed_words.py
import pandas as pd
from math import log

prev = cycle
cycle = "correction7"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

verified_tokens = []

for filename in corpus:  
    content = utilities.readfile(directories['prev'], filename)
    clean.get_approved_tokens(content, spelling_dictionary, verified_tokens)

tokens_with_freq = dict(collections.Counter(verified_tokens))
words = pd.DataFrame(list(tokens_with_freq.items()), columns=['token','freq'])
words_sorted = words.sort_values('freq', ascending=False)
words_sorted_short = words_sorted[words_sorted.freq > 2]

sorted_list_of_words = list(words_sorted_short['token'])

wordcost = dict((k, log((i+1)*log(len(sorted_list_of_words)))) for i,k in enumerate(sorted_list_of_words))
maxword = max(len(x) for x in sorted_list_of_words)

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    text = utilities.strip_punct(content)
    tokens = utilities.tokenize_text(text)
    
    replacements = []
    
    for token in tokens:
        if not token.lower() in spelling_dictionary:
            if len(token) > 17:
                if re.search(r"[\-\-\'\"]", token):
                    pass
                else:
                    split_string = clean.infer_spaces(token, wordcost, maxword)
                    list_split_string = split_string.split()
                    
                    if clean.verify_split_string(list_split_string, spelling_dictionary):
                        replacements.append((token, split_string))
                    else:
                        pass
            else:
                pass
        else:
            pass
        
    if len(replacements) > 0:
        print("{}: {}".format(filename, replacements))
        
        for replacement in replacements:
            content = clean.replace_pair(replacement, content)
    
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
GS18860301-V01-03-page1.txt: [('forciblyillustrated', 'forcibly illustrated'), ('terriblethreatenings', 'terrible threatenings')]
GS18860401-V01-05-page1.txt: [('theexaminationofthings', 'the examination of things')]
GS18860501-V01-07-page2.txt: [('containedprovisions', 'contained provisions')]
GS18860501-V01-07-page7.txt: [('iiiiiiiiiiiiiiiiiiiiiiiii', 'iii iii iii iii iii iii iii iii i')]
GS18860501-V01-07-page8.txt: [('Thisisonlyinfulfillmentofthecharacteristicsgiven', 'This is only in fulfillment of the characteristics given'), ('TheLordwishedtoassurehishearersthe', 'The Lord wished to assure his hearers the')]
GS18860601-V01-09-page1.txt: [('Biblesaysdistinctly', 'Bible says distinctly'), ('thebranchesofthefig', 'the branches of the fig')]
GS18860701-V01-11-page1.txt: [('EDITORIALCOMMITTEE', 'EDITORIAL COMMITTEE')]
GS18860901-V01-15-page5.txt: [('poraljudgmentswerethusbroughtuponthem', 'p oral judgments were thus brought upon them')]
GS18860915-V01-16-page4.txt: [('utterlyastonishing', 'utterly astonishing')]
GS18861001-V01-17-page7.txt: [('propertyaggregates', 'property aggregate s')]
GS18861001-V01-17-page8.txt: [('andtothedoctrinewhichisaccordingtogodliness', 'and to the doctrine which is according to godliness')]
GS18861101-V01-19-page6.txt: [('andbothwerewrittenbythesameapostle', 'and both were written by the same apostle')]
GS18861115-V01-20-page1.txt: [('cametheirinevitableportion', 'came their inevitable portion')]
GS18861208-V01-22-page1.txt: [('Itisthisdiscordwhichdisgustspeople', 'It is this discord which dis g u s t s p e o p l e')]
GS18861215-V01-23-page5.txt: [('thathemaydipthetipofhisfingerin', 'that he may dip the tip of his finger in')]
GS18861215-V01-23-page6.txt: [('Thatwasanageofrites', 'That was an age of rites')]
GS18870101-V02-01-page1.txt: [('stonebythefingerofGod', 'stone by the finger of God'), ('wasthefirstwritten', 'was the first written')]
GS18870115-V02-02-page3.txt: [('stateratherthanoft', 'state rather than oft'), ('Probablytheruleofou', 'Probably the rule of o u'), ('forthechildrenofwantarein', 'for the children of want are in')]
GS18870315-V02-06-page8.txt: [('whosVlawwehavetransgrossed', 'who s V law we have trans gross ed')]
GS18870515-V02-10-page7.txt: [('worshipcongregational', 'worship congregational')]
GS18870615-V02-12-page1.txt: [('thefirstdayofApriliscelebrated', 'the first day of April is celebrated')]
GS18870615-V02-12-page6.txt: [('tieforhispiallnowdsJ', 'tie for his pi all now d s J')]
GS18870701-V02-13-page5.txt: [('fortheotherwecanbut', 'for the other we can but'), ('apostlesthemselves', 'apostles themselves')]
GS18870801-V02-15-page7.txt: [('bethoughtthemselveswhatagrandthingitwouldbe', 'be thought themselves what a grand thing it would be'), ('encouragingreports', 'encouraging reports')]
GS18870801-V02-15-page8.txt: [('SitcaliSailitarium', 'Sit cal iS ail it ar i u m'), ('forinterpretatitan', 'for interpret a titan')]
GS18870815-V02-16-page3.txt: [('thesunandtherendingofthe', 'the sun and the rend ing of the'), ('whoshallchangeourvilebyo', 'who shall change our vile by o')]
GS18870901-V02-17-page4.txt: [('AndifSatancastoutSatan', 'And if Satan cast out Satan'), ('willnotevenhavethecompanionshipofthosewho', 'will not even have the companionship of those who'), ('thatisintheveryself', 'that is in the very self')]
GS18870915-V02-18-page8.txt: [('temporalmillennium', 'temporal millennium')]
GS18871101-V02-21-page1.txt: [('exampleofbaseringratitudeorofgreaterinconsistency', 'example of baser ingratitude or of greater inconsistency')]
GS18871101-V02-21-page6.txt: [('Butimagineoursurprise', 'But imagine our surprise'), ('uponlookingoverthe', 'upon looking over the')]
GS18871115-V02-22-page2.txt: [('goodpeopleshouldtryto', 'good people should try to'), ('whoshalljudgethequickandthedeadat', 'who shall judge the quick and the dead at')]
GS18871201-V02-23-page8.txt: [('TherewasaUniversalistupon', 'There was a Universalist upon')]
GS18880201-V03-03-page6.txt: [('vtothiveetabsletse', 'v tot hive eta b s lets e')]
GS18880215-V03-04-page6.txt: [('writingintheIndependentconcerningtheeRomanCatholic', 'writing in the Independent concerning thee Roman Catholic')]
GS18880315-V03-06-page3.txt: [('consciencestricken', 'conscience stricken')]
GS18880615-V03-12-page3.txt: [('theLordJesusChristsiemlf', 'the Lord Jesus Christs i e m l f')]
GS18880801-V03-15-page8.txt: [('advancedforadopting', 'advanced for adopting')]
GS18880915-V03-18-page1.txt: [('themanisdrawntocome', 'the man is drawn to come')]
GS18881015-V03-20-page4.txt: [('WhatanoversightintheLord', 'What an over sight in the Lord'), ('benoreasontodoubtthattheyknowthatmanyoftheOatholic', 'be no reason to doubt that they know that many of the Oath o l i c')]
GS18881015-V03-20-page5.txt: [('ThepromisesofGodwillallberedeemed', 'The promises of God will all be redeemed')]
GS18881101-V03-21-page1.txt: [('supremelovetoGodandequallovetoourfellowthemythicgorgeousnessofgenuineoldheathenism', 'supreme love to God and equal love to our fellow the myth i c gorgeous ness of genuine old heathenism')]
GS18881101-V03-21-page4.txt: [('ChristianInstructed', 'Christian Instructed'), ('dayofrestandholiness', 'day of rest and holiness'), ('andsubstitutesSundayinitsplace', 'and substitutes Sunday in its place'), ('withthefourthcommandment', 'with the fourth commandment')]
In [34]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/GS/correction7

Average verified rate: 0.9762387961387358

Average of error rates: 0.024405357142857142

Total token count: 1275230

In [35]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[35]:
[("'", 2240),
 ('t', 758),
 ('d', 745),
 ('e', 535),
 ('w', 522),
 ('r', 483),
 ('f', 403),
 ('m', 385),
 ('g', 332),
 ('n', 328),
 ('th', 241),
 ("of'", 218),
 ('u', 200),
 ('ex', 193),
 ("the'", 178),
 ('x', 159),
 ('pp', 119),
 ('aro', 110),
 ('re', 105),
 ('eze', 75),
 ("and'", 66),
 ('k', 66),
 ('ti', 61),
 ('wo', 59),
 ('bo', 57),
 ('q', 53),
 ("to'", 49),
 ('ots', 48),
 ('tion', 44),
 ('ft', 44),
 ('mo', 41),
 ("''", 41),
 ('co', 40),
 ("in'", 40),
 ('wm', 40),
 ('mt', 37),
 ('nd', 36),
 ('ment', 32),
 ("saints'", 30),
 ('mal', 29),
 ("that'", 29),
 ('pa', 29),
 ('al', 29),
 ('goapxl', 27),
 ('ay', 27),
 ('iu', 27),
 ('se', 27),
 ('il', 26),
 ('es', 25),
 ('gospxi', 24)]
In [ ]: