LB-OCR-Evaluation-and-Correction

In [1]:
%load_ext autoreload
In [2]:
%autoreload 2
In [3]:
from text2topics import reports
from text2topics import utilities
from text2topics import clean
import re
import os
from os import listdir
from os.path import isfile, join
import collections
In [4]:
%matplotlib inline
In [5]:
wordlist_dir = "/Users/jeriwieringa/Dissertation/drafts/data/word-lists"
wordlists = ["2016-12-07-SDA-last-names.txt", 
             "2016-12-07-SDA-place-names.txt", 
             "2016-12-08-SDA-Vocabulary.txt", 
             "2017-01-03-place-names.txt", 
             "2017-02-14-Base-Word-List-SCOWL&KJV.txt",
             "2017-02-14-Roman-Numerals.txt",
             "2017-03-01-Additional-Approved-Words.txt"
            ]
In [6]:
spelling_dictionary = utilities.create_spelling_dictionary(wordlist_dir, wordlists)
In [7]:
title = "LB"
In [8]:
base_dir = "/Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/{}/".format(title)

Baseline

In [9]:
cycle = 'baseline'
In [10]:
stats = reports.overview_report(join(base_dir, cycle), spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/LB/baseline

Average verified rate: 0.9573822478765006

Average of error rates: 0.054559093619558734

Total token count: 5224513

In [11]:
errors_summary = reports.get_errors_summary( stats )
reports.top_errors( errors_summary, 500 )
Out[11]:
[('¥', 4001),
 ('m', 3562),
 ('-', 3386),
 ('e', 3325),
 ("'", 3217),
 ('d', 3161),
 ('w', 2673),
 (')', 2624),
 ('re-', 2372),
 ('con-', 2294),
 ('in-', 1955),
 ('tion', 1892),
 ('t', 1880),
 ('be-', 1710),
 ('n', 1653),
 ('r', 1344),
 ('*', 1295),
 ('f', 1195),
 ('ex-', 1097),
 ('¥¥', 1084),
 ('x', 1046),
 ('de-', 1026),
 ('com-', 972),
 ('ñ', 919),
 ('g', 895),
 ('mis-', 875),
 ('+', 853),
 ('dis-', 852),
 (']', 851),
 ('co', 833),
 ('ment', 819),
 ('en-', 668),
 ("workingmen's", 621),
 ('sub-', 619),
 ('un-', 615),
 ('per-', 597),
 ("prisoners'", 553),
 ('ñed', 523),
 ('tions', 507),
 ('k', 503)]

Check Special Character Use

In [12]:
reports.tokens_with_special_characters(errors_summary)[:500]
Out[12]:
[('¥', 4001),
 (')', 2624),
 ('*', 1295),
 ('¥¥', 1084),
 ('ñ', 919),
 ('+', 853),
 (']', 851),
 ('ñed', 523),
 ('(', 495),
 ('/', 425),
 ('_', 369),
 ('%', 332),
 ('(from', 247),
 ('•', 225),
 ('ñthe', 224),
 ('ö', 192),
 ('**', 164),
 ('ñen', 162),
 ('¥¥¥', 156),
 ('ña', 150),
 ('••', 145),
 ('[the', 130),
 ('ã', 129),
 ('(matt', 118),
 ('=', 117),
 ('(ps', 109),
 ('(isa', 108),
 ('(a)', 100),
 ('(for', 98),
 ('(the', 96),
 ('__', 95),
 ('ñi', 95),
 ('(c)', 93),
 ('(b)', 92),
 ('(john', 87),
 ('ñdan', 84),
 ('++', 81),
 ('[we', 77),
 ('(d)', 76),
 ('%x', 71),
 ('(luke', 70),
 ('chicagoñtrains', 68),
 ('(rom', 66),
 ('only)', 65),
 ('\\', 65),
 ('❑', 63),
 ('ñeditor', 60),
 ('(i', 59),
 ('[miss', 59),
 ('ñthat', 58),
 ('`', 56),
 ('¥¥¥¥', 50),
 ('(prov', 50),
 ('~~', 50),
 ('¡', 50),
 ('[mrs', 49),
 ('(gen', 49),
 ('(see', 48),
 ('ñit', 47),
 ('ô', 45),
 ('(acts', 45),
 ('ñselected', 45),
 ('[', 45),
 ('\ufeff', 44),
 ('ñand', 44),
 ('(heb', 44),
 ('____', 44),
 ('penitentiary)', 43),
 ('[mr', 43),
 ('[this', 42),
 ('minutesñ', 42),
 ('#', 41),
 ('minutesñfifty', 41),
 ('laundryñin', 39),
 ('(or', 38),
 ('¥+¥', 38),
 ('(and', 38),
 ('(rev', 36),
 ('ñto', 36),
 ('wantedñto', 35),
 ('(not', 34),
 ('[a', 34),
 ('[dr', 33),
 ('(continued', 33),
 ('ñno', 33),
 ('(verse', 33),
 ('(with', 32),
 ('+++', 32),
 ('i¥', 32),
 ('(jer', 31),
 ('___', 31),
 ('(phil', 31),
 ('(job', 31),
 ('(a', 31),
 ('also)', 30),
 ('lunches)', 30),
 ('(penny', 30),
 ('ofñ', 30),
 ('(concluded', 30),
 ('ñrev', 30),
 ('ñfifty', 29),
 ('appliancesñ', 28),
 ('(that', 28),
 ('ñnot', 28),
 ('¥+', 28),
 ('prison)', 28),
 ('(eccl', 26),
 ('ñwe', 26),
 ('ñin', 26),
 ('(to', 26),
 ('(gal', 25),
 ('ñnatural', 25),
 ('themñnot', 25),
 ('[brother', 25),
 ('(mark', 25),
 ('ñhe', 24),
 ('è', 23),
 ('chicago)', 23),
 ('nowñ', 23),
 ('i)', 23),
 ('÷', 23),
 ('-¥', 22),
 ('ñone', 22),
 ('ñbut', 22),
 ('c¥', 22),
 ('(entered', 21),
 ('(one', 21),
 ('wrappingñno', 21),
 ('ç', 21),
 ('ñfor', 21),
 ('`i', 21),
 ('ñfrom', 21),
 ('++++', 20),
 ('ñisa', 20),
 ('ñall', 20),
 ('ñjohn', 20),
 ('ñmatt', 20),
 ('ò', 20),
 ('<', 19),
 ('(numbers', 19),
 ('(incorporated)', 18),
 ('desiredñone', 18),
 ('`the', 18),
 ('ñat', 18),
 ('ñhow', 18),
 ('[from', 18),
 ('ñjust', 18),
 ('(lid', 18),
 ('writtenñthe', 18),
 ('(james', 18),
 ('•••', 18),
 ('ñchicago', 18),
 ('(this', 18),
 ('(mich', 18),
 ('(as', 17),
 ('¥¥¥¥¥', 17),
 ('comeñthe', 17),
 ('¥*', 17),
 ('[some', 17),
 ('ñthis', 17),
 ('(in', 16),
 ('______', 16),
 ('ñfamous', 16),
 ('`to', 16),
 ('+++++', 16),
 ('_____', 16),
 ('`if', 16),
 ('l¥', 16),
 ('/x', 16),
 ('(dan', 15),
 ('+¥', 15),
 ('}', 15),
 ('*abstract', 15),
 ('ó', 15),
 ('year)', 15),
 ('ñbefore', 15),
 ('[for', 15),
 ('(t', 15),
 ('¥a', 15),
 ('[in', 15),
 ('+++++++', 15),
 ('fiftyñ', 14),
 ('~', 14),
 ('ñw', 14),
 ('¥¥)¥', 14),
 ('(joel', 14),
 ('(cut', 14),
 ('ñsome', 14),
 ('¥i', 14),
 ('t¥', 14),
 ('♦', 14),
 ('}editorial', 14),
 ('(if', 13),
 ('ñthere', 13),
 ('[every', 13),
 ('sizes)', 13),
 ('¥the', 13),
 ("¥'", 13),
 ('appliancesñall', 13),
 ('ñps', 13),
 ('twenty-fiveñ', 13),
 ('homeñthe', 13),
 ('(which', 12),
 ('ñnever', 12),
 ('and¥', 12),
 ('++++++', 12),
 ('(ex', 12),
 ('[on', 12),
 ('*report', 12),
 ('oil)', 12),
 ('¥t', 12),
 ('the¥', 12),
 ('***', 12),
 ('(x', 12),
 ('a¥', 12),
 ('[it', 12),
 ('ñdr', 12),
 ('ñas', 12),
 ('[our', 12),
 ('taken)', 12),
 ('ñif', 12),
 ('of¥', 11),
 ('ñonly', 11),
 ('[about', 11),
 ('[last', 11),
 ('¥-¥', 11),
 ('ñyou', 11),
 ('(col', 11),
 ('ñabout', 11),
 ('(ill', 11),
 ('(eph', 11),
 ('california)', 11),
 ('ñwhat', 11),
 ('(wis', 11),
 ('+++++++++++++++++++++++++++', 11),
 ('*from', 11),
 ('ñby', 11),
 ('to¥', 11),
 ('¥and', 10),
 ('ñthen', 10),
 ('(agents', 10),
 ('ñthey', 10),
 ('ñsan', 10),
 ('ñluke', 10),
 ('ñmrs', 10),
 ('it¥', 10),
 ('+++++++++', 10),
 ('it)', 10),
 ('ñdenver', 10),
 ('(deut', 10),
 ('ñso', 10),
 ('chicagoñ', 10),
 ('[during', 10),
 ('(at', 10),
 ('¦', 10),
 ('translation)', 10),
 ('continued)', 10),
 ('ñis', 10),
 ('¥to', 10),
 ('^', 10),
 ('(two', 10),
 ('[as', 10),
 ('[recently', 10),
 ('(it', 10),
 ('¥in', 9),
 ('itñ', 9),
 ('(care-', 9),
 ('(now', 9),
 ('\\v', 9),
 ('(e)', 9),
 ('(hosea', 9),
 ('*talk', 9),
 (')¥', 9),
 ('(is', 9),
 ('has¥', 9),
 ('meñ', 9),
 ('(related', 9),
 ('i*', 9),
 ('(our', 9),
 ('(continued)', 9),
 ('(inc', 9),
 ('nowñbefore', 9),
 ('[when', 9),
 ('(mal', 9),
 ('¥¥¥¥¥¥', 9),
 ('¥¥x¥¥', 9),
 ('twenty=five', 9),
 ('well=equipped', 9),
 ('¥¥x¥', 9),
 ('(psa', 9),
 ('ñyour', 9),
 ('*¥', 9),
 ('ñtuition', 9),
 ('ñan', 9),
 ('(he', 9),
 ('ñe', 9),
 ("'¥", 8),
 ('+ö', 8),
 ('ñconsecrated', 8),
 ('[there', 8),
 ('`what', 8),
 ('ñour', 8),
 ('ñyet', 8),
 ('(sears', 8),
 ('-_', 8),
 ('ñeither', 8),
 ('`no', 8),
 ('(next', 8),
 ('¥¥¥¥¥¥¥¥', 8),
 ('ñlincoln', 8),
 ('ñportland', 8),
 ('(jas', 8),
 ('ñhis', 8),
 ('number)', 8),
 ('e¥', 8),
 ('laundryñ', 8),
 ('ñbutte', 8),
 ('ñnow', 8),
 ('girls)', 8),
 ('version)', 8),
 ('nurse)', 8),
 ('ñlet', 8),
 ('ñmission', 8),
 ('==', 8),
 ('prisoner)', 8),
 ('—', 8),
 ('(r', 8),
 ('[several', 8),
 ('ñrom', 8),
 ('[not', 8),
 ('ñcleveland', 8),
 ('+++++++++++++++', 8),
 ('(so', 8),
 ('[at', 8),
 ('(who', 8),
 ('ñthose', 8),
 ('ñspokane', 7),
 ('(member', 7),
 ('ñacts', 7),
 ('t_', 7),
 ('[years', 7),
 ('modelñno', 7),
 ('cents)', 7),
 ('inñ', 7),
 ('lifeñto', 7),
 ('*-', 7),
 ('machineñwith', 7),
 ('**********************************', 7),
 ('>', 7),
 ('(testimony', 7),
 ('(she', 7),
 ('sinñ', 7),
 ('(taken', 7),
 ('ñgood', 7),
 ('name)', 7),
 ('rememberñ', 7),
 ('class)', 7),
 ('(note', 7),
 ('ñmark', 7),
 ('_______', 7),
 ('ñdavid', 7),
 ('¥¥¥¥¥¥¥', 7),
 ('ñbrooklyn', 7),
 ('•+', 7),
 ('ñmy', 7),
 ('beñin', 7),
 ('*the', 7),
 ('(jno', 7),
 ('ñguthrie', 7),
 ('¥i¥', 7),
 ('++++++++', 7),
 ('a)', 7),
 ('ñevansville', 7),
 ('ñchattanooga', 7),
 ('he¥', 7),
 ('i_', 7),
 ('ñyes', 7),
 ('usñ', 7),
 ('[extracts', 7),
 ('¥¥)', 7),
 ('(by', 7),
 ('planñpay', 7),
 ('voices)', 7),
 ('ñheb', 7),
 ('++++++++++', 7),
 ('sirñi', 7),
 ('(eze', 7),
 ('everywhereñpeople', 7),
 ('ñh', 6),
 ('ñdo', 6),
 ('packsñany', 6),
 ('byñ', 6),
 ('¥¥¥¥¥¥¥¥¥', 6),
 ('muskokañ', 6),
 ('years)', 6),
 ('`¥', 6),
 ('menñ', 6),
 ('¥of', 6),
 ("(a')", 6),
 ('itñthat', 6),
 ('(j', 6),
 ('ñfree', 6),
 ('ñhaving', 6),
 ('wantedñ', 6),
 ('filmsñv', 6),
 ('coã', 6),
 ('(extracts', 6),
 ('•+•', 6),
 ('lifeñthe', 6),
 ('homeña', 6),
 ('\\ve', 6),
 ('themñ', 6),
 ('(even', 6),
 ('ñwho', 6),
 ('workñthe', 6),
 ('*a', 6),
 ('mindñ', 6),
 ('typewriterñthe', 6),
 ('(ready', 6),
 ('upñon', 6),
 ('-*', 6),
 ('*told', 6),
 ('¥s', 6),
 ('ñtwo', 6),
 ('deptã', 6),
 ('is¥', 6),
 ('[pastor', 6),
 ('(sore', 6),
 ('ñwas', 6),
 ('`yes', 6),
 ('(expense)', 6),
 ('(we', 6),
 ('ñwhich', 6),
 ('(lev', 6),
 ('doñ', 6),
 ('training=school', 6),
 ('(dent', 6),
 ('(f)', 6),
 ('wantedña', 6),
 ('(god', 6),
 ('ñwith', 6),
 ('dayñ', 6),
 ('ñyours', 6),
 ('[many', 6),
 ('++++++++++++++', 6),
 ('ñmany', 6),
 ('ö+', 6),
 ('*+', 6),
 ('earlyñnow', 6),
 ('=nt', 6),
 ('t)', 6),
 ('+++++++++++', 6),
 ('_¥', 6),
 ('periodsñ', 6),
 ('ñheard', 6),
 ('handñ', 6),
 ('*lumber', 6),
 ('street)', 6),
 ('month)', 6),
 ('godñ', 6),
 ('workñed', 6),
 ('existenceñthe', 6),
 ('a_', 6),
 ('(of', 6),
 ('ill¥', 6),
 ('/-', 6),
 ('awayñ', 5),
 ('`go', 5),
 ('ñof', 5),
 ('/ix', 5),
 ('forñto', 5),
 ('*t', 5),
 ('`come', 5),
 ('e/', 5),
 ('ñend', 5),
 ('songñ', 5),
 ('*(talk', 5),
 ('ñperhaps', 5),
 ('ñpsa', 5),
 ('[free', 5),
 ('ñsuch', 5),
 ('¥s¥', 5),
 ('choice)', 5),
 ('ã-', 5),
 ('(romans', 5),
 ('christñmrs', 5),
 ('ñprov', 5),
 ('placeñthe', 5),
 ('(abstract', 5),
 ('typewritersñthe', 5),
 ('+++++++++++++', 5),
 ('(lays', 5),
 ('floor)', 5),
 ('(money', 5),
 ('`my', 5),
 ('paidñor', 5),
 ('++++++++++++', 5),
 ('eñ', 5),
 ('••••', 5),
 ('[those', 5)]

Correction 1 -- Normalize Characters

In [13]:
# %load shared_elements/normalize_characters.py
prev = 'baseline'
cycle = "correction1"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    # Substitute for all other dashes
    content = re.sub(r"—-—–‑", r"-", content)

    # Substitute formatted apostrophe
    content = re.sub(r"\’\’\‘\'\‛\´", r"'", content)
    
    # Replace all special characters with a space (as these tend to occur at the end of lines)
    content = re.sub(r"[^a-zA-Z0-9\s,.!?$:;\-&\'\"]", r" ", content)
    
    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
In [14]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/LB/correction1

Average verified rate: 0.9635807467075778

Average of error rates: 0.04575790101371498

Total token count: 5213259

In [15]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[15]:
[('-', 3638),
 ('m', 3600),
 ('e', 3417),
 ("'", 3337),
 ('d', 3272),
 ('w', 2711),
 ('re-', 2374),
 ('con-', 2295),
 ('t', 2024),
 ('in-', 1957),
 ('tion', 1900),
 ('be-', 1712),
 ('n', 1690),
 ('r', 1413),
 ('f', 1244),
 ('x', 1239),
 ('ex-', 1098),
 ('de-', 1027),
 ('com-', 973),
 ('g', 925),
 ('mis-', 876),
 ('dis-', 853),
 ('co', 845),
 ('ment', 821),
 ('en-', 669),
 ("workingmen's", 622),
 ('sub-', 619),
 ('un-', 615),
 ('per-', 599),
 ("prisoners'", 554),
 ('k', 519),
 ('tions', 508),
 ('some-', 495),
 ('th', 480),
 ('ac-', 454),
 ('im-', 453),
 ('ers', 452),
 ('to-', 446),
 ('ful', 433),
 ('chris-', 429),
 ('al-', 426),
 ('soul-winning', 426),
 ('oo', 422),
 ('pro-', 419),
 ('ad-', 416),
 ('an-', 402),
 ('ments', 399),
 ('pre-', 366),
 ('for-', 354),
 ('ap-', 347)]

Correction 2 -- Correct Line Endings

In [16]:
# %load shared_elements/correct_line_endings.py
prev = cycle
cycle = "correction2"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    content = re.sub(r"(\w+)(\-\s{1,})([a-z]+)", r"\1\3", content)

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
In [17]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/LB/correction2

Average verified rate: 0.9828071372542159

Average of error rates: 0.02732128801431127

Total token count: 5143995

In [18]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[18]:
[('m', 3596),
 ('-', 3572),
 ('e', 3414),
 ("'", 3337),
 ('d', 3267),
 ('w', 2711),
 ('t', 2008),
 ('n', 1687),
 ('r', 1404),
 ('x', 1238),
 ('f', 1238),
 ('g', 921),
 ('co', 831),
 ("workingmen's", 684),
 ("prisoners'", 589),
 ('k', 517),
 ('th', 479),
 ('soul-winning', 435),
 ('oo', 422),
 ('u', 297),
 ("'i", 226),
 ('red-letter', 220),
 ('--', 214),
 ('hsi', 213),
 ("'the", 210),
 ('anti-cigarette', 173),
 ('halsted', 164),
 ('z', 162),
 ("'tis", 160),
 ('wm', 158),
 ('mo', 158),
 ('rd', 141),
 ('pa', 138),
 ('seven-jeweled', 134),
 ('oc', 132),
 ('broken-hearted', 118),
 ('ti', 110),
 ('ft', 105),
 ('q', 91),
 ('harner', 91),
 ('sub-', 89),
 ('izer', 87),
 ('vapor-', 87),
 ('li', 87),
 ('con-', 86),
 ('pp', 84),
 ('mcauley', 83),
 ('al', 83),
 ('pavlson', 81),
 ('stapp', 80)]

Correction 3 -- Remove Extra Dashes

In [19]:
# %load shared_elements/remove_extra_dashes.py
prev = cycle
cycle = "correction3"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)
    
    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    
    replacements = []
    for token in tokens:
        if token[0] is "-":
            replacements.append((token, token[1:]))
            
        elif token[-1] is "-":
            replacements.append((token, token[:-1]))
        else:
            pass
        
    if len(replacements) > 0:
#         print("{}: {}".format(filename, replacements))
        
        for replacement in replacements:
            content = clean.replace_pair(replacement, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
In [20]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/LB/correction3

Average verified rate: 0.9844966635129426

Average of error rates: 0.02491568276684556

Total token count: 5144183

In [21]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[21]:
[('m', 3607),
 ('e', 3441),
 ("'", 3378),
 ('d', 3292),
 ('w', 2726),
 ('t', 2047),
 ('n', 1700),
 ('r', 1435),
 ('x', 1275),
 ('f', 1251),
 ('g', 928),
 ('co', 885),
 ("workingmen's", 684),
 ("prisoners'", 589),
 ('k', 532),
 ('th', 482),
 ('oo', 423),
 ('soul-winning', 348),
 ('u', 309),
 ("'i", 227),
 ('hsi', 213),
 ("'the", 210),
 ('red-letter', 190),
 ('z', 166),
 ('halsted', 164),
 ("'tis", 160),
 ('wm', 159),
 ('mo', 159),
 ('rd', 141),
 ('pa', 141),
 ('anti-cigarette', 138),
 ('oc', 133),
 ('ex', 131),
 ('re', 125),
 ('ti', 120),
 ('ft', 106),
 ('broken-hearted', 101),
 ('seven-jeweled', 96),
 ('q', 95),
 ('harner', 91),
 ('al', 90),
 ('li', 89),
 ('izer', 87),
 ('-', 87),
 ('pp', 84),
 ('mcauley', 83),
 ('pavlson', 81),
 ('stapp', 80),
 ("'to", 75),
 ('ky', 75)]

Correction 4 -- Remove Extra Quotation Marks

In [22]:
# %load shared_elements/replace_extra_quotation_marks.py
prev = cycle
cycle = "correction4"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)
    
    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    
    corrections = []
    for token in tokens:
        token_list = list(token)
        last_char = token_list[-1]

        if last_char is "'":
            if len(token) > 1:
                if token_list[-2] is 's' or 'S':
                    pass
                else:
                    corrections.append((token, re.sub(r"'", r"", token)))
            else:
                pass
        elif token[0] is "'":
            corrections.append((token, re.sub(r"'", r"", token)))   
        else:
            pass
    
    if len(corrections) > 0:
#         print('{}: {}'.format(filename, corrections))

        for correction in corrections:
            content = clean.replace_pair(correction, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
In [23]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/LB/correction4

Average verified rate: 0.9852831084376651

Average of error rates: 0.02378878950506858

Total token count: 5144089

In [24]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[24]:
[('m', 3611),
 ('e', 3456),
 ('d', 3295),
 ("'", 3226),
 ('w', 2728),
 ('t', 2079),
 ('n', 1707),
 ('r', 1441),
 ('x', 1280),
 ('f', 1260),
 ('g', 930),
 ('co', 888),
 ("workingmen's", 678),
 ("prisoners'", 585),
 ('k', 533),
 ('th', 482),
 ('oo', 428),
 ('soul-winning', 348),
 ('u', 310),
 ('hsi', 213),
 ('red-letter', 190),
 ('z', 169),
 ('halsted', 164),
 ('wm', 160),
 ('mo', 159),
 ('rd', 142),
 ('pa', 141),
 ('anti-cigarette', 138),
 ('oc', 133),
 ('ex', 131),
 ('re', 130),
 ('ti', 120),
 ('ft', 107),
 ('broken-hearted', 101),
 ('q', 97),
 ('seven-jeweled', 96),
 ('al', 93),
 ('harner', 91),
 ('li', 90),
 ('izer', 87),
 ('-', 87),
 ('pp', 84),
 ('mcauley', 83),
 ('pavlson', 81),
 ('stapp', 80),
 ('ky', 75),
 ('soulwinning', 74),
 ('ninety-six', 74),
 ('pearsons', 72),
 ('good-bye', 70)]

Correction 5 -- Rejoin Burst Words

In [25]:
# %load shared_elements/rejoin_burst_words.py
prev = cycle
cycle = "correction5"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    pattern = re.compile("(\s(\w{1,2}\s){5,})")
    
    replacements = []
    clean.check_splits(pattern, spelling_dictionary, content, replacements)
    
    if len(replacements) > 0:
#         print('{}: {}'.format(filename, replacements))

        for replacement in replacements:
            content = clean.replace_pair(replacement, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
In [26]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/LB/correction5

Average verified rate: 0.9852801583159106

Average of error rates: 0.023788431723315448

Total token count: 5144145

In [27]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[27]:
[('m', 3610),
 ('e', 3451),
 ('d', 3295),
 ("'", 3226),
 ('w', 2728),
 ('t', 2080),
 ('n', 1706),
 ('r', 1437),
 ('x', 1280),
 ('f', 1259),
 ('g', 930),
 ('co', 888),
 ("workingmen's", 678),
 ("prisoners'", 585),
 ('k', 533),
 ('th', 482),
 ('oo', 428),
 ('soul-winning', 348),
 ('u', 309),
 ('hsi', 213),
 ('red-letter', 190),
 ('z', 169),
 ('halsted', 164),
 ('wm', 160),
 ('mo', 159),
 ('rd', 142),
 ('pa', 141),
 ('anti-cigarette', 138),
 ('oc', 133),
 ('ex', 131),
 ('re', 131),
 ('ti', 120),
 ('ft', 107),
 ('broken-hearted', 101),
 ('q', 97),
 ('seven-jeweled', 96),
 ('al', 93),
 ('harner', 91),
 ('li', 90),
 ('izer', 87),
 ('-', 87),
 ('pp', 84),
 ('mcauley', 83),
 ('pavlson', 81),
 ('stapp', 80),
 ('ky', 75),
 ('soulwinning', 74),
 ('ninety-six', 74),
 ('pearsons', 72),
 ('good-bye', 70)]

Correction 6 -- Rejoin Split Words

In [28]:
# %load shared_elements/rejoin_split_words.py
prev = cycle
cycle = "correction6"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)
    
    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    errors = reports.identify_errors(tokens, spelling_dictionary)

    replacements = clean.check_if_stem(errors, spelling_dictionary, tokens, get_prior=False)
    
    if len(replacements) > 0:
#         print('{}: {}'.format(filename, replacements))

        for replacement in replacements:
            content = clean.replace_split_words(replacement, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
In [29]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/LB/correction6

Average verified rate: 0.9855275460757164

Average of error rates: 0.023463208109719743

Total token count: 5143219

In [30]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[30]:
[('m', 3608),
 ('e', 3419),
 ('d', 3290),
 ("'", 3226),
 ('w', 2726),
 ('t', 2072),
 ('n', 1698),
 ('r', 1425),
 ('x', 1280),
 ('f', 1257),
 ('g', 925),
 ('co', 837),
 ("workingmen's", 678),
 ("prisoners'", 585),
 ('k', 533),
 ('th', 464),
 ('oo', 415),
 ('soul-winning', 348),
 ('u', 307),
 ('hsi', 213),
 ('red-letter', 190),
 ('z', 169),
 ('halsted', 164),
 ('wm', 160),
 ('mo', 155),
 ('rd', 141),
 ('anti-cigarette', 138),
 ('pa', 136),
 ('oc', 133),
 ('ex', 113),
 ('ft', 107),
 ('broken-hearted', 101),
 ('q', 97),
 ('ti', 97),
 ('seven-jeweled', 96),
 ('harner', 91),
 ('izer', 87),
 ('-', 87),
 ('pp', 84),
 ('mcauley', 83),
 ('re', 82),
 ('pavlson', 81),
 ('stapp', 80),
 ('al', 78),
 ('ky', 75),
 ('soulwinning', 74),
 ('ninety-six', 74),
 ('pearsons', 72),
 ('li', 71),
 ('good-bye', 70)]

Correction 7 -- Rejoin Split Words II

In [31]:
# %load shared_elements/rejoin_split_words.py
prev = cycle
cycle = "correction7"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)
    
    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    errors = reports.identify_errors(tokens, spelling_dictionary)

    replacements = clean.check_if_stem(errors, spelling_dictionary, tokens, get_prior=True)
    
    if len(replacements) > 0:
#         print('{}: {}'.format(filename, replacements))

        for replacement in replacements:
            content = clean.replace_split_words(replacement, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
In [32]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/LB/correction7

Average verified rate: 0.9856858140213685

Average of error rates: 0.023234227787716163

Total token count: 5142451

In [33]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[33]:
[('m', 3593),
 ('e', 3408),
 ('d', 3287),
 ("'", 3226),
 ('w', 2715),
 ('t', 2056),
 ('n', 1693),
 ('r', 1397),
 ('x', 1280),
 ('f', 1247),
 ('g', 925),
 ('co', 837),
 ("workingmen's", 678),
 ("prisoners'", 585),
 ('k', 533),
 ('th', 458),
 ('oo', 387),
 ('soul-winning', 348),
 ('u', 307),
 ('hsi', 213),
 ('red-letter', 190),
 ('z', 167),
 ('halsted', 164),
 ('wm', 160),
 ('mo', 154),
 ('rd', 141),
 ('anti-cigarette', 138),
 ('pa', 136),
 ('oc', 133),
 ('ex', 112),
 ('ft', 102),
 ('broken-hearted', 101),
 ('q', 97),
 ('seven-jeweled', 96),
 ('harner', 91),
 ('-', 87),
 ('mcauley', 83),
 ('pp', 82),
 ('pavlson', 81),
 ('stapp', 80),
 ('ti', 79),
 ('al', 76),
 ('ky', 75),
 ('soulwinning', 74),
 ('ninety-six', 74),
 ('pearsons', 72),
 ('good-bye', 70),
 ('li', 70),
 ('heart-broken', 69),
 ('mt', 68)]

Review Remaining Errors

In [34]:
reports.long_errors(errors_summary, min_length=20)
Out[34]:
(['nittlitiltlitlitiltlitivtimmttim',
  'ageorsisommewesseesomsemowirmarnannwireormow',
  'tellwhenyoushippedthem',
  '------------------------------------------------------------------------',
  'foefbsomateanodnehawshomhisassedsuabnsycribed',
  'inhganedhsmamaegatozianey',
  'three-thousand-dollar',
  'toanswerlettersofinquiryfromeven',
  'hereweeseetheeseetheetherehe',
  'ininillimilligilninminine',
  'tarlgienrsegauniljeretaz',
  'cutthisadvertisementoutandsendtous',
  'afflivrtiiimorryamorwareswar',
  'maiiiiiiiiiiiiimmolluminmitilluniiiinnuiliifntimma',
  'emzerrialifkacesmlaiewn',
  'miavvaiminompantoakinmainmavwvit',
  'sionnuoioneenonomauevounownr',
  'pthcaotuldhappentothereformeddrunkardis',
  "andwewillforwardyouabo'x",
  'friaresmririiftwayvbro',
  'aosoilyreowswomozirmaanarrierionnewisresiwowzrowiwkrooisrowareci',
  'esireetheresibeisietesereeefee',
  'iiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiii',
  'atndaddmiirrattionfro',
  'illustrationsaccompanying',
  'hejeeemereeheeeeezieeeeiseie',
  'timellmodoteetnih-enoffvaml',
  'erismmipprwsimosiiminries',
  'lininlemmemeelmlnliimmminmelnl',
  'raftronweinwrovrawrivrtuf',
  'rrelirmitrailtrecillarict',
  'never-to-be-forgotten',
  'aytintattintnirnfefliiatyai',
  "for'yaoursupbbscriptiotn",
  'attflisayfromttaitatwv',
  'sleetbhreaigtheedsntegraedeilinestreumeeg',
  'solidquartersawedoakdrop',
  'eeestemeseeeeseeezemeem',
  'givesainllustrateddescription',
  'immiiimiiiiimiuminiiimi',
  'intelligent-appearing',
  'seventy-three-hundredths',
  'saorfethoenceylnesbera',
  'inkiemmimmitiamcminemissem',
  'good-forwhat-ails-you',
  'heereeemeeteheeheeeemeefeeeeeetee',
  'mmisviumummssugratrat',
  'eleven-thousand-dollar',
  'imuusumnimmilmitsmmmigulnumommimmituimonnzmtimitimuimmillimmitnmmuntm',
  'once-a-thief-always-one',
  'agifivnifimmimmaiipaminvimpf',
  'wfitevisvaviamasimmptamimark',
  'classifiegarrangement',
  'setsoforchestralroneesdonatorypipequalityreeds',
  'iiiiiiiiiiiiiiiiiiiiii',
  'oarzemwsrissovoivorwriteistar',
  'biscuit-steak-waffles',
  'ertrobfusgahnittsaraiunmtafroiuodmsff',
  'tnaaawainaaaaapowaaaaaaaaaaaaaa',
  'icittrittottvitammeatericitcbtv',
  'orrocomneorwomoirdrosegl',
  'doublethecirculationof',
  'waimmiiamitzimmiummimminumtzaala',
  'alsoallnecessarytools',
  'alwowitowtomazaamisfotagcceicwomweigzerfavi',
  'nartorysigrearliyrrionssiviairmrsorio',
  'theynneeitihtherbecomelonesome',
  'tailtailtritclimiairactlrairrinittiarigs',
  "don'tdelayyourorderadayour",
  '---------------------------------------------------------------------------',
  'vszvosprwvxvovorsavrorsx',
  'foronlytwonewsubscriptionswewillsend',
  'tooliffeiratwooptionally',
  'isienivailowrosenraneamireourowywe',
  'typhoid-fever-infected',
  'iiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiii',
  'hygienically-prepared',
  'inventsomesatisfactorysubstituteforthe',
  'efeeireeeseeeefeemeeie',
  'iiiiiiiiiiiiiiiiiiiiiialazil',
  'iseeemeheiveheighibeeire',
  'kaaallquilmsaaimsumsavamp',
  'ouracmejewelandacmeprinceare',
  'yeanilriatiesitlidtrivaricitaraclrricarignixlmactalretreic',
  'raaawaiaaaaanammaaaaaawaaaaanynamaaaaaawaaaaaaaaaamanyvv',
  'hiiihhaliiihilillihheeileilhillh',
  'wheanndspweicsuelatniodnsieatihnice',
  'miliiiiiiiiiiiilittiiimilinniniiiiiiiiiiimiiimiiiiiiiniiiiitiliittiltiiiiiiillnig',
  'godsolovedtheworldthathe',
  'thehinsdalesanitarium',
  'yearshasbeenconnectedwiththeruralin',
  'improvementassoonasprovidenceopensthe',
  'ashrdigelviviphmeeinsiohrjrrtcleroatlitior',
  'iiiiiiiiiiiiiiiilliliiiiiiiiil',
  'ahrlitgosfrtymirmilpfwtre',
  'cmwycaowiceciaaawakffieieriewasaw',
  'iummuumammminiumnimmthimmumummmus',
  'soul-and-bodydestroying',
  'ittaattitaittlitgaqtlitlelltitt',
  'wrootearmishorionacianowirwoor',
  'paulson----businessmanager',
  'filowstimdvalgaatietitoafbah',
  'sanitariumbyeqsuciieppnecde',
  'uminimmluuminummilmumstmutitttmmutmunitittatttuuumuutututtuutututtailm',
  'physicallrighteousness',
  'iseemeeseemesiemesseemeemin',
  'theparlorgemactionwconiseisitasroef',
  'efeakaegnarreilfilvbfrgip',
  'fueirpnaitsehidepderame',
  'iiiiiiiiiiiiiiiiiiiiiii',
  'eeeeeeeehteeereeeeeeeeeeeej',
  'fammininieriminimerrimieerief',
  'iiiiiimmiiiiiimiffillinliiiiiiiiiiiiiiiiiiiiiimiiiii',
  'iiiiiiiiiiiupinllnlpllpnn',
  "p'aftllerawleios'vfia",
  'agrormsecoserosnowrzoire',
  'wetrdamvedsweetesttoned',
  'tictleammeramssicticommyemmarommramemy',
  'jrjppppileirwrirwriargfmrrezej',
  'iiiiiiiiiiiiiiiiiiiiiiiiiiiii',
  'eizeimeieheeeeiberrreireherefereees',
  'butthelordjesuschristiswithmeanaldie',
  'grfoamlfrottrottromivmposrommoraittogi',
  'lerigeargalfieloawilmefazinainefizwzmnruca',
  'emsceesnmeemesesestroeseso',
  'zamamammmmmfkgaamwmavaa',
  'yareardinweimpropraisrpors',
  'ortmortmacstmartancsareme',
  'boatatfiftycentseachwoneewillo',
  'rozromwasiusurnacomprosorwasimmoommr',
  'iiiiiimiiiiiiiimannamilii',
  'wawaffsagfayfansamamamapfiag',
  "toresublasrcrsiubbesrcsritpotigonooprdiche'e",
  'umummummumumummumminnummumn',
  'hundred-thousand-dollar-a-year',
  'alphabetical-arrangement',
  'vvvvvvvvvvvvvvvvvvvvvvvvvvvv',
  'gneelinnowseesareereareare',
  'andbolidisytriuneatilvl',
  'reiarozownnellimmoldnfroseoz',
  'calleirkildahbwasiloin',
  'huiltlurstarationshown',
  'frrrrrrrierrrfferrirfrfirrfrwieieeteieeei',
  'christiarpanumarmanao',
  'whimmiiiiiimimmummiiiiiimmimiiiiiiiiiiiiiiiimimmimiminie',
  'eiseeteeemberieseetee',
  'grariggagrowgnaleganarretagii',
  'firnirmirmwririmenjesi',
  'ouracmejewelandacmeprinceat',
  'inlininemineinininminemiliminemmemeignmle',
  'ltraltradtri-kawidommysiltactrigimirsilbaltrim',
  'isieheeseefeemeimemeemej',
  'featilowcwackacmirgieret',
  'donsenlaiibreifdistarrleing',
  'finvinittnrtrittintro',
  'mnviirialimmarinoinmorewsrell',
  'batesorfadverrtisning',
  'theseyearsthisnoblechristian',
  'theillustrationsbeingoneofmr',
  'imimininpupwimilinmiamimpsvipapvlimmiimmilammilwaimmium',
  'eemeremeemereethemeeeem',
  'inivaimimapavapmimpatalylt',
  'toborrowseveralthousanddollarsinsums',
  'iminnimumminumnimminemeir',
  'pirimpipripplispgaliarimmairrompssmstiaffierssargsisorwraimrs',
  'hasnialrebildsnigriettol',
  'blistagitiommawyamaparm',
  'mummuummumummummummummummummummummummumnumummummung',
  'meyeasdailyandfinditineverywaysatisfactory',
  'aweiewietzawmnewlwpmfakw',
  'ouracmejewelandacmeprince',
  'thisisthebestpremiumcameveroffered',
  'sesseommemwommeseemsesessommoomosemmeoposeoesseosimies',
  'stmmommmustmtwomputvp',
  'woraimarvarrifireatemanr',
  'fifeeineeeieereeemsor',
  'mniamimmlimummiainimmvittp',
  'rnnmunnmunnmmnnnmmmumnmmmmmunmmr',
  'memevomnewevawnwcawanikaliti',
  'turning-out-the-light',
  'sanitariumbyeqsuciiepnpecde',
  'imunummutmonnutenummiswitunimmums',
  'ewmaswasenimoweourawmereariwzromejtc',
  'thenilesbryantschoolofpianotuning',
  'trisltradrailtriariatial',
  'ooooooooooooooooooooo',
  'ozooalwargroserorrootoorer',
  'eirillelzlelzaztazyuciz',
  'worthylaborofloveinyourwill',
  'anatomico-pathological',
  'walvandociniperzvaedie',
  'pennsylvania-delaware',
  'mtmadaetnouafalitcoesreawho',
  "mountaineer'sfamilyreunion",
  'twenty-nine-million-dollar',
  "don'tdelayyourorderadayourspecial",
  'rimmumnummumummummumummuummumummummuumummuumummuumuma',
  'souland-body-destroying',
  'cirgeirgewmceffaceirtzamoinwi',
  'mammmawmimpananwiwairaanyarwawawainmwaphwnahwomiwawa',
  "hudsontaylor'sthrillingmissionarybook",
  'iiiiiiiiiiiiiiiiiiiiiiiiiii',
  "thisisthebestpremium'weeveroffered",
  'owicozeicivioewevoeizwcwigiccoatkiowavn',
  'eversmokedacigarette-withoutbeing',
  'air-eiazargslrzzrayakiwzdelrimwir',
  'gotimulliiiiiiiiiiiiiii',
  'heat-and-energy-producing',
  'atltlregtrallivallties',
  'iiiiiiiiiiiiiiiiiiiiiiiiii',
  'hiesebeeeseeeheisieszeiseheem',
  'obwicnyenlameplateatodaouble',
  'beenntstillsalfgritiqtridisbili',
  'zismovvitttottittintsal',
  'believethinhimshouldnotperish',
  'steieicvkvpfiggmffsfvkkvwmekzokvkvwsnnnlz'],
 20)
In [35]:
reports.docs_with_high_error_rate(summary)
Out[35]:
[('LB19200201-V23-02-page36.txt', 0.889),
 ('LB19200601-V23-06-page36.txt', 0.81),
 ('LB19070101-V10-01-page2.txt', 0.75),
 ('LB19190401-V22-04-page14.txt', 0.733),
 ('LB19071101-V10-11-page2.txt', 0.684),
 ('LB19071001-V10-10-page1.txt', 0.676),
 ('LB19040101-V07-01-page1.txt', 0.667),
 ('LB19040401-V07-04-page2.txt', 0.656),
 ('LB19030601-V06-06-page2.txt', 0.6),
 ('LB19000801-V03-06-page2.txt', 0.583),
 ('LB19070801-V10-08-page2.txt', 0.571),
 ('LB19101001-V13-10-page2.txt', 0.545),
 ('LB19020401-V05-04-page14.txt', 0.5),
 ('LB19121201-V15-12-page16.txt', 0.5),
 ('LB19050601-V08-06-page21.txt', 0.5),
 ('LB19020401-V05-04-page6.txt', 0.5),
 ('LB19010401-V04-02-page2.txt', 0.5),
 ('LB19020801-V05-08-page36.txt', 0.467),
 ('LB19120501-V15-05-page2.txt', 0.466),
 ('LB19190701-V22-07-page4.txt', 0.464),
 ('LB19070701-V10-07-page2.txt', 0.462),
 ('LB19021001-V05-10-page36.txt', 0.449),
 ('LB19030101-V06-01-page1.txt', 0.444),
 ('LB19101201-V13-12-page2.txt', 0.444),
 ('LB19020901-V05-09-page28.txt', 0.435),
 ('LB19080601-V11-06-page1.txt', 0.433),
 ('LB19050701-V08-07-page1.txt', 0.424),
 ('LB19010301-V04-01-page2.txt', 0.421),
 ('LB19181101-V21-11-page2.txt', 0.42),
 ('LB19041001-V07-10-page31.txt', 0.416),
 ('LB19070301-V10-03-page25.txt', 0.409),
 ('LB19050801-V08-08-page1.txt', 0.407),
 ('LB19020701-V05-07-page28.txt', 0.385),
 ('LB19041101-V07-11-page7.txt', 0.384),
 ('LB19070501-V10-05-page16.txt', 0.364),
 ('LB19151101-V18-11-page10.txt', 0.36),
 ('LB19060601-V09-06-page1.txt', 0.355),
 ('LB19030901-V06-09-page1.txt', 0.355),
 ('LB19061001-V09-10-page1.txt', 0.345),
 ('LB19050501-V08-05-page2.txt', 0.343),
 ('LB19181001-V21-10-page2.txt', 0.335),
 ('LB19191001-V22-10-page2.txt', 0.335),
 ('LB19200801-V23-08-page1.txt', 0.333),
 ('LB19070501-V10-05-page2.txt', 0.333),
 ('LB19050401-V08-04-page1.txt', 0.333),
 ('LB19060501-V09-05-page13.txt', 0.333),
 ('LB19040801-V07-08-page1.txt', 0.333),
 ('LB19010501-V04-03-page2.txt', 0.333),
 ('LB19050901-V08-09-page1.txt', 0.333),
 ('LB19020801-V05-08-page33.txt', 0.332),
 ('LB19030201-V06-02-page1.txt', 0.325),
 ('LB19040601-V07-06-page1.txt', 0.323),
 ('LB19040301-V07-03-page1.txt', 0.321),
 ('LB19060801-V09-08-page1.txt', 0.312),
 ('LB19060501-V09-05-page1.txt', 0.31),
 ('LB19200701-V23-07-page2.txt', 0.31),
 ('LB19121101-V15-11-page7.txt', 0.308),
 ('LB19040301-V07-03-page2.txt', 0.304),
 ('LB19000501-V03-03-page28.txt', 0.299),
 ('LB19040901-V07-09-page1.txt', 0.296),
 ('LB19180201-V21-02-page18.txt', 0.294),
 ('LB19050601-V08-06-page1.txt', 0.29),
 ('LB19000701-V03-05-page32.txt', 0.288),
 ('LB19060301-V09-03-page1.txt', 0.286),
 ('LB19001101-V03-09-page2.txt', 0.286),
 ('LB19031201-V06-12-page1.txt', 0.286),
 ('LB19081201-V11-12-page1.txt', 0.286),
 ('LB19060201-V09-02-page1.txt', 0.286),
 ('LB19030401-V06-04-page16.txt', 0.286),
 ('LB19050101-V08-01-page1.txt', 0.286),
 ('LB19060701-V09-07-page1.txt', 0.281),
 ('LB19051201-V08-12-page1.txt', 0.281),
 ('LB19051101-V08-11-page1.txt', 0.28),
 ('LB19030401-V06-04-page1.txt', 0.278),
 ('LB19090201-V12-02-page14.txt', 0.273),
 ('LB19051001-V08-10-page1.txt', 0.273),
 ('LB19010501-V04-03-page17.txt', 0.271),
 ('LB19050201-V08-02-page1.txt', 0.269),
 ('LB19041201-V07-12-page1.txt', 0.269),
 ('LB19030601-V06-06-page1.txt', 0.267),
 ('LB19190801-V22-08-page2.txt', 0.267),
 ('LB19080501-V11-05-page1.txt', 0.267),
 ('LB19041001-V07-10-page1.txt', 0.265),
 ('LB19080501-V11-05-page2.txt', 0.258),
 ('LB19060401-V09-04-page1.txt', 0.258),
 ('LB19071201-V10-12-page1.txt', 0.257),
 ('LB19010801-V04-06-page2.txt', 0.256),
 ('LB19040201-V07-02-page2.txt', 0.255),
 ('LB19010901-V04-07-page2.txt', 0.253),
 ('LB19021201-V05-12-page8.txt', 0.25),
 ('LB19000601-V03-04-page32.txt', 0.25),
 ('LB19010401-V04-02-page4.txt', 0.25),
 ('LB19030801-V06-08-page1.txt', 0.25),
 ('LB19061101-V09-11-page1.txt', 0.25),
 ('LB19031001-V06-10-page1.txt', 0.25),
 ('LB19060201-V09-02-page7.txt', 0.25),
 ('LB19041101-V07-11-page1.txt', 0.25),
 ('LB19080101-V11-01-page1.txt', 0.25),
 ('LB19060901-V09-09-page1.txt', 0.25),
 ('LB19040301-V07-03-page36.txt', 0.241),
 ('LB19040401-V07-04-page1.txt', 0.24),
 ('LB19020401-V05-04-page1.txt', 0.238),
 ('LB19140201-V17-02-page18.txt', 0.238),
 ('LB19151101-V18-11-page2.txt', 0.237),
 ('LB19030601-V06-06-page32.txt', 0.236),
 ('LB19031101-V06-11-page2.txt', 0.235),
 ('LB19090401-V12-04-page1.txt', 0.231),
 ('LB19010101-V03-11-page2.txt', 0.231),
 ('LB19040201-V07-02-page1.txt', 0.227),
 ('LB19031101-V06-11-page1.txt', 0.227),
 ('LB19070901-V10-09-page1.txt', 0.226),
 ('LB19000401-V03-02-page21.txt', 0.224),
 ('LB19180401-V21-04-page2.txt', 0.224),
 ('LB19090801-V12-08-page1.txt', 0.222),
 ('LB19200301-V23-03-page1.txt', 0.222),
 ('LB19090701-V12-07-page1.txt', 0.222),
 ('LB19191201-V22-12-page1.txt', 0.218),
 ('LB19030301-V06-03-page2.txt', 0.218),
 ('LB19041001-V07-10-page34.txt', 0.217),
 ('LB19090201-V12-02-page1.txt', 0.216),
 ('LB19040201-V07-02-page36.txt', 0.214),
 ('LB19000301-V03-01-page14.txt', 0.213),
 ('LB19140101-V17-01-page2.txt', 0.212),
 ('LB19010301-V04-01-page1.txt', 0.211),
 ('LB19020501-V05-05-page25.txt', 0.209),
 ('LB19041201-V07-12-page2.txt', 0.208),
 ('LB19000101-V02-11-page20.txt', 0.208),
 ('LB19010301-V04-01-page17.txt', 0.207),
 ('LB19070201-V10-02-page1.txt', 0.207),
 ('LB19070501-V10-05-page1.txt', 0.207),
 ('LB19080401-V11-04-page1.txt', 0.206),
 ('LB19090901-V12-09-page1.txt', 0.206),
 ('LB19080201-V11-02-page1.txt', 0.206)]
In [36]:
# %load shared_elements/high_error_rates.py
doc_keys = [x[0] for x in reports.docs_with_high_error_rate(summary) if x[1] > 0.4]

# utilities.open_original_docs(doc_keys, directories['cycle'])

High error documents are image pages, many of which were scanned upside-down and are uncorrectable algorithmically.

Correction 8 -- Remove long errors

In [37]:
# %load shared_elements/remove-tokens-with-long-strings-of-characters.py
prev = cycle
cycle = "correction8"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    
    replacements = []
    replacements.append(clean.check_for_repeating_characters(tokens, "m|M"))
    replacements.append(clean.check_for_repeating_characters(tokens, "i|I"))
    replacements.append(clean.check_for_repeating_characters(tokens, "s|S"))
    replacements.append(clean.check_for_repeating_characters(tokens, "o|O"))
    replacements.append(clean.check_for_repeating_characters(tokens, "e|E"))
    
    replacements = [item for sublist in replacements for item in sublist]
            
    if len(replacements) > 0:
#         print('{}: {}'.format(filename, replacements))

        for replacement in replacements:
            content = clean.replace_pair(replacement, content)
    else:
        pass
    
    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
In [38]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/LB/correction8

Average verified rate: 0.9857011567031103

Average of error rates: 0.02319988073941563

Total token count: 5142374

In [39]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[39]:
[('m', 3593),
 ('e', 3408),
 ('d', 3287),
 ("'", 3226),
 ('w', 2715),
 ('t', 2056),
 ('n', 1693),
 ('r', 1397),
 ('x', 1280),
 ('f', 1247),
 ('g', 925),
 ('co', 837),
 ("workingmen's", 678),
 ("prisoners'", 585),
 ('k', 533),
 ('th', 458),
 ('oo', 387),
 ('soul-winning', 348),
 ('u', 307),
 ('hsi', 213),
 ('red-letter', 190),
 ('z', 167),
 ('halsted', 164),
 ('wm', 160),
 ('mo', 154),
 ('rd', 141),
 ('anti-cigarette', 138),
 ('pa', 136),
 ('oc', 133),
 ('ex', 112),
 ('ft', 102),
 ('broken-hearted', 101),
 ('q', 97),
 ('seven-jeweled', 96),
 ('harner', 91),
 ('-', 87),
 ('mcauley', 83),
 ('pp', 82),
 ('pavlson', 81),
 ('stapp', 80),
 ('ti', 79),
 ('al', 76),
 ('ky', 75),
 ('soulwinning', 74),
 ('ninety-six', 74),
 ('pearsons', 72),
 ('good-bye', 70),
 ('li', 70),
 ('heart-broken', 69),
 ('mt', 68)]

Correction 9 -- Separate Squashed Words

In [40]:
# %load shared_elements/separate_squashed_words.py
import pandas as pd
from math import log

prev = cycle
cycle = "correction9"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

verified_tokens = []

for filename in corpus:  
    content = utilities.readfile(directories['prev'], filename)
    clean.get_approved_tokens(content, spelling_dictionary, verified_tokens)

tokens_with_freq = dict(collections.Counter(verified_tokens))
words = pd.DataFrame(list(tokens_with_freq.items()), columns=['token','freq'])
words_sorted = words.sort_values('freq', ascending=False)
words_sorted_short = words_sorted[words_sorted.freq > 2]

sorted_list_of_words = list(words_sorted_short['token'])

wordcost = dict((k, log((i+1)*log(len(sorted_list_of_words)))) for i,k in enumerate(sorted_list_of_words))
maxword = max(len(x) for x in sorted_list_of_words)

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    text = utilities.strip_punct(content)
    tokens = utilities.tokenize_text(text)
    
    replacements = []
    
    for token in tokens:
        if not token.lower() in spelling_dictionary:
            if len(token) > 17:
                if re.search(r"[\-\-\'\"]", token):
                    pass
                else:
                    split_string = clean.infer_spaces(token, wordcost, maxword)
                    list_split_string = split_string.split()
                    
                    if clean.verify_split_string(list_split_string, spelling_dictionary):
                        replacements.append((token, split_string))
                    else:
                        pass
            else:
                pass
        else:
            pass
        
    if len(replacements) > 0:
        print("{}: {}".format(filename, replacements))
        
        for replacement in replacements:
            content = clean.replace_pair(replacement, content)
    
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
In [41]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/LB/correction9

Average verified rate: 0.9857117078086152

Average of error rates: 0.023180083482409065

Total token count: 5142602

In [42]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[42]:
[('m', 3593),
 ('e', 3412),
 ('d', 3287),
 ("'", 3226),
 ('w', 2717),
 ('t', 2057),
 ('n', 1694),
 ('r', 1398),
 ('x', 1280),
 ('f', 1248),
 ('g', 925),
 ('co', 837),
 ("workingmen's", 678),
 ("prisoners'", 585),
 ('k', 533),
 ('th', 458),
 ('oo', 387),
 ('soul-winning', 348),
 ('u', 307),
 ('hsi', 213),
 ('red-letter', 190),
 ('z', 167),
 ('halsted', 164),
 ('wm', 160),
 ('mo', 154),
 ('rd', 141),
 ('anti-cigarette', 138),
 ('pa', 136),
 ('oc', 133),
 ('ex', 112),
 ('ft', 102),
 ('broken-hearted', 101),
 ('q', 97),
 ('seven-jeweled', 96),
 ('harner', 91),
 ('-', 87),
 ('mcauley', 83),
 ('pp', 82),
 ('pavlson', 81),
 ('stapp', 80),
 ('ti', 79),
 ('al', 76),
 ('ky', 75),
 ('soulwinning', 74),
 ('ninety-six', 74),
 ('pearsons', 72),
 ('good-bye', 70),
 ('li', 70),
 ('heart-broken', 69),
 ('mt', 68)]
In [ ]: