GOH-OCR-Evaluation-and-Correction

In [1]:
%load_ext autoreload
In [2]:
%autoreload 2
In [3]:
from text2topics import reports
from text2topics import utilities
from text2topics import clean
import re
import os
from os import listdir
from os.path import isfile, join
import collections
In [4]:
%matplotlib inline
In [5]:
wordlist_dir = "/Users/jeriwieringa/Dissertation/drafts/data/word-lists"
wordlists = ["2016-12-07-SDA-last-names.txt", 
             "2016-12-07-SDA-place-names.txt", 
             "2016-12-08-SDA-Vocabulary.txt", 
             "2017-01-03-place-names.txt", 
             "2017-02-14-Base-Word-List-SCOWL&KJV.txt",
             "2017-02-14-Roman-Numerals.txt",
             "2017-03-01-Additional-Approved-Words.txt"
            ]
In [6]:
spelling_dictionary = utilities.create_spelling_dictionary(wordlist_dir, wordlists)
In [7]:
title = "GOH"
In [8]:
base_dir = "/Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/{}/".format(title)

Baseline

In [9]:
cycle = 'baseline'
In [10]:
stats = reports.overview_report(join(base_dir, cycle), spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/GOH/baseline

Average verified rate: 0.9537225762261566

Average of error rates: 0.05685820895522388

Total token count: 481790

In [11]:
errors_summary = reports.get_errors_summary( stats )
reports.top_errors( errors_summary, 50 )
Out[11]:
[('ñ', 1321),
 ('-', 669),
 ('m', 454),
 ('d', 418),
 ("'", 414),
 ('¥', 353),
 (')', 339),
 ('e', 318),
 ('tion', 258),
 ('*', 247),
 ('con-', 215),
 ('w', 204),
 ('f', 174),
 ('re-', 152),
 ('in-', 152),
 ('(', 151),
 ('g', 142),
 ('r', 134),
 ('t', 133),
 ('_', 124),
 ('dis-', 96),
 ('n', 88),
 ('be-', 85),
 ('¡', 84),
 ('pre-', 80),
 ('per-', 79),
 ('ã', 79),
 ('u', 78),
 ('ex-', 77),
 ('co', 74),
 ('--', 73),
 ('tions', 72),
 ('ment', 66),
 ('com-', 63),
 ('de-', 62),
 ('pro-', 62),
 ('=', 54)]

Check Special Character Use

In [12]:
reports.tokens_with_special_characters(errors_summary)[:100]
Out[12]:
[('ñ', 1321),
 ('¥', 353),
 (')', 339),
 ('*', 247),
 ('(', 151),
 ('_', 124),
 ('¡', 84),
 ('ã', 79),
 ('=', 54),
 ('<', 47),
 ('/', 37),
 ('ô', 25),
 ('ñthe', 24),
 ('¦', 22),
 ('<<', 22),
 ('**', 17),
 ('¥¥', 16),
 ('(see', 15),
 ('(to', 14),
 ('%', 14),
 ('(h', 14),
 ('ñh', 10),
 ('(ps', 10),
 ('(lincoln)', 10),
 ('_health', 10),
 ('ñthese', 10),
 ('(john', 10),
 ('addressñ', 10),
 ('__', 9),
 ('\\', 9),
 ('_-', 9),
 ('(rom', 9),
 ('*-', 9),
 ('(a)', 8),
 ('`', 8),
 ('if/', 8),
 ('(and', 8),
 ('#', 8),
 ('ñit', 7),
 ('ñthat', 7),
 ('-*', 7),
 ('[the', 7),
 ('ñin', 7),
 ('(matt', 7),
 ('___', 7),
 ('kñ', 7),
 ('(concluded', 7),
 ('(b)', 7),
 ('ñby', 6),
 ('(isa', 6),
 ('(c)', 6),
 ('(which', 6),
 (']', 6),
 ('(one', 6),
 ('(right)', 6),
 ('ãã', 6),
 ('ã_', 5),
 ('(acts', 5),
 ('(the', 5),
 ('[', 5),
 ('(a', 5),
 ('¥the', 5),
 ('ñi', 5),
 ('==', 5),
 ('(eph', 5),
 ("'¥", 5),
 ('ña', 4),
 ('(d)', 4),
 ("'*", 4),
 ('ñthen', 4),
 ('+', 4),
 ('pur\x8ee', 4),
 ('=-', 4),
 ('(good', 4),
 ('news)', 4),
 ('(six', 4),
 ('(heb', 4),
 ('[or', 4),
 ('ñwhat', 4),
 ('(or', 4),
 ('ñbecause', 4),
 ('(special', 4),
 ("*'", 4),
 ('¨', 4),
 ('ñibid', 4),
 ('caramel=', 4),
 ('¥¥¥¥', 3),
 ('(james', 3),
 ('(in', 3),
 ('(fig', 3),
 ('q¨', 3),
 ('ã-', 3),
 ('******************', 3),
 ('(c', 3),
 ('(luke', 3),
 ('____', 3),
 ('(first', 3),
 ('months)', 3),
 ('(gen', 3),
 ('ñgod', 3)]

Correction 1 -- Normalize Characters

In [13]:
# %load shared_elements/normalize_characters.py
prev = "baseline"
cycle = "correction1"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    # Substitute for all other dashes
    content = re.sub(r"—-—–‑", r"-", content)

    # Substitute formatted apostrophe
    content = re.sub(r"’", r"'", content)
    
    # Replace all special characters with a space (as these tend to occur at the end of lines)
    content = re.sub(r"[^a-zA-Z0-9\s,.!?$:;\-&\'\"]", r" ", content)
    
    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()

Check Correction 1

In [14]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/GOH/correction1

Average verified rate: 0.9620343555361606

Average of error rates: 0.04705820895522388

Total token count: 478933

In [15]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[15]:
[('-', 784),
 ("'", 464),
 ('m', 463),
 ('d', 429),
 ('e', 347),
 ('tion', 258),
 ('con-', 215),
 ('w', 209),
 ('f', 191),
 ('g', 153),
 ('in-', 153),
 ('re-', 152),
 ('t', 146),
 ('r', 145),
 ('n', 100),
 ('--', 97),
 ('dis-', 96),
 ('be-', 85),
 ('u', 80),
 ('pre-', 80),
 ('per-', 79),
 ('ex-', 77),
 ('co', 75),
 ('tions', 72),
 ('ment', 66),
 ('com-', 64),
 ('de-', 62),
 ('pro-', 62),
 ('im-', 48),
 ('mis-', 47),
 ('k', 46),
 ('ful', 45),
 ('ments', 44),
 ('un-', 43),
 ('q', 41),
 ('---', 40),
 ('ple', 37),
 ('sionary', 37),
 ('ical', 37),
 ('prin-', 37),
 ('sub-', 36),
 ('sani-', 33),
 ('op', 32),
 ('ac-', 32),
 ('tarium', 32),
 ('ciples', 31),
 ('condi-', 31),
 ('db', 31),
 ('princi-', 31),
 ('ber', 31)]

Correction 2 -- Fix line endings

In [16]:
# %load shared_elements/correct_line_endings.py
prev = "correction1"
cycle = "correction2"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    content = re.sub(r"(\w+)(\-\s{1,})([a-z]+)", r"\1\3", content)

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()

Check Correction 2

In [17]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/GOH/correction2

Average verified rate: 0.9810627054607609

Average of error rates: 0.028468656716417913

Total token count: 472718

In [18]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[18]:
[('-', 781),
 ("'", 464),
 ('m', 463),
 ('d', 429),
 ('e', 346),
 ('w', 209),
 ('f', 191),
 ('g', 153),
 ('t', 145),
 ('r', 144),
 ('n', 100),
 ('--', 97),
 ('u', 80),
 ('co', 75),
 ('k', 46),
 ('q', 41),
 ('---', 40),
 ('op', 32),
 ('db', 31),
 ('camp-ground', 30),
 ('ft', 29),
 ('th', 27),
 ("''", 27),
 ("workingmen's", 24),
 ('z', 22),
 ('abbie', 20),
 ('ex', 17),
 ("the'", 16),
 ('mc', 16),
 ('x', 16),
 ('grape-sugar', 15),
 ('left-over', 14),
 ('lenna', 13),
 ("'the", 13),
 ('----', 12),
 ('wm', 12),
 ('tion', 12),
 ("'s", 11),
 ('flesh-foods', 11)]

Correction 3 -- Remove extra dashes

In [19]:
# %load shared_elements/remove_extra_dashes.py
prev = "correction2"
cycle = "correction3"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)
    
    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    
    replacements = []
    for token in tokens:
        if token[0] is "-":
            replacements.append((token, token[1:]))
            
        elif token[-1] is "-":
            replacements.append((token, token[:-1]))
        else:
            pass
        
    if len(replacements) > 0:
#         print("{}: {}".format(filename, replacements))
        
        for replacement in replacements:
            content = clean.replace_pair(replacement, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()

Check Correction 3

In [20]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/GOH/correction3

Average verified rate: 0.9840901635007143

Average of error rates: 0.023855223880597017

Total token count: 472475

In [21]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[21]:
[("'", 481),
 ('m', 463),
 ('d', 430),
 ('e', 351),
 ('w', 210),
 ('f', 197),
 ('g', 157),
 ('t', 155),
 ('r', 151),
 ('n', 101),
 ('co', 100),
 ('u', 81),
 ('k', 47),
 ('q', 41),
 ('op', 33),
 ('db', 31),
 ('ft', 30),
 ("''", 29),
 ('th', 27),
 ('camp-ground', 26),
 ('z', 25),
 ("workingmen's", 24),
 ('abbie', 20),
 ('ex', 18),
 ('x', 18),
 ("the'", 16),
 ('re', 16),
 ('mc', 16),
 ('lenna', 13),
 ("'the", 13),
 ('left-over', 13),
 ('wm', 12),
 ('-', 12),
 ('tion', 12),
 ("'s", 11)]

Correction 4 -- Remove extra quotation marks

In [22]:
# %load shared_elements/replace_extra_quotation_marks.py
prev = "correction3"
cycle = "correction4"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)
    
    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    
    corrections = []
    for token in tokens:
        token_list = list(token)
        last_char = token_list[-1]

        if last_char is "'":
            if len(token) > 3:
                if token_list[-2] is 's' or 'S':
                    pass
                else:
                    corrections.append((token, re.sub(r"'", r"", token)))
            else:
                pass
        elif token[0] is "'":
            corrections.append((token, re.sub(r"'", r"", token)))   
        else:
            pass
    
    if len(corrections) > 0:
#         print('{}: {}'.format(filename, corrections))

        for correction in corrections:
            content = clean.replace_pair(correction, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()

Check Correction 4

In [23]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/GOH/correction4

Average verified rate: 0.9848074952794642

Average of error rates: 0.022889552238805972

Total token count: 472404

In [24]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[24]:
[('m', 465),
 ('d', 430),
 ("'", 411),
 ('e', 354),
 ('w', 213),
 ('f', 198),
 ('t', 158),
 ('g', 157),
 ('r', 151),
 ('n', 103),
 ('co', 100),
 ('u', 81),
 ('k', 48),
 ('q', 41),
 ('op', 33),
 ('db', 31),
 ('ft', 30),
 ('th', 27),
 ('camp-ground', 26),
 ('z', 25),
 ("workingmen's", 24),
 ("''", 23),
 ('abbie', 20),
 ('ex', 18),
 ('x', 18),
 ('mc', 16),
 ('re', 16),
 ('lenna', 13),
 ("the'", 13),
 ('left-over', 13),
 ('wm', 12),
 ('-', 12),
 ('tion', 12)]

Correction 5 -- Rejoin Burst Words

In [25]:
# %load shared_elements/rejoin_burst_words.py
prev = "correction4"
cycle = "correction5"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    pattern = re.compile("(\s(\w{1,2}\s){5,})")
    
    replacements = []
    clean.check_splits(pattern, spelling_dictionary, content, replacements)
    
    if len(replacements) > 0:
#         print('{}: {}'.format(filename, replacements))

        for replacement in replacements:
            content = clean.replace_pair(replacement, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()

Check Correction 5

In [26]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/GOH/correction5

Average verified rate: 0.9848198749349061

Average of error rates: 0.022858208955223884

Total token count: 472394

In [27]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[27]:
[('m', 465),
 ('d', 429),
 ("'", 411),
 ('e', 353),
 ('w', 213),
 ('f', 198),
 ('g', 157),
 ('t', 157),
 ('r', 150),
 ('n', 102),
 ('co', 100),
 ('u', 81),
 ('k', 47),
 ('q', 41),
 ('op', 33),
 ('db', 31),
 ('ft', 30),
 ('th', 27),
 ('camp-ground', 26),
 ('z', 25),
 ("workingmen's", 24),
 ("''", 23),
 ('abbie', 20),
 ('ex', 18),
 ('x', 18),
 ('mc', 16),
 ('re', 16),
 ('lenna', 13),
 ("the'", 13),
 ('left-over', 13),
 ('wm', 12),
 ('-', 12),
 ('tion', 12)]

Correction 6 -- Rejoin Split Words

In [28]:
# %load shared_elements/rejoin_split_words.py
prev = "correction5"
cycle = "correction6"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)
    
    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    errors = reports.identify_errors(tokens, spelling_dictionary)

    replacements = clean.check_if_stem(errors, spelling_dictionary, tokens, get_prior=False)
    
    if len(replacements) > 0:
        print('{}: {}'.format(filename, replacements))

        for replacement in replacements:
            content = clean.replace_split_words(replacement, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
GOH18970201-V01-01-page16.txt: [('nu', 'rses')]
GOH18970201-V01-01-page19.txt: [('BROMOS', 'E')]
GOH18970201-V01-01-page20.txt: [('CIRCULA', 'TION'), ('GLAS', 'GOW')]
GOH18970301-V01-02-page18.txt: [('Mc', 'Coy')]
GOH18970301-V01-02-page19.txt: [('Prin', 'ciples'), ('va', 'n')]
GOH18970301-V01-02-page20.txt: [('CIRCULA', 'TION'), ('GLAS', 'GOW')]
GOH18970301-V01-02-page5.txt: [('RE', 'FORM')]
GOH18970601-V01-05,06-page11.txt: [('AMERI', 'CAN')]
GOH18970601-V01-05,06-page21.txt: [('PUBLICA', 'TIONS')]
GOH18970601-V01-05,06-page26.txt: [('re', 'establishing'), ('perha', 'ps')]
GOH18970601-V01-05,06-page35.txt: [('PR', 'ES')]
GOH18970601-V01-05,06-page5.txt: [('AMERI', 'CAN')]
GOH18970601-V01-05,06-page8.txt: [('MIS', 'SIONARY')]
GOH18970801-V01-07-page18.txt: [('somethi', 'ng')]
GOH18970801-V01-07-page20.txt: [('Sani', 'tarium')]
GOH18970801-V01-07-page5.txt: [('askin', 'g')]
GOH18971001-V01-09-page19.txt: [('Sani', 'tarium')]
GOH18971001-V01-09-page7.txt: [('co', 'workers')]
GOH18971201-V01-11-page1.txt: [('COU', 'GH')]
GOH18971201-V01-11-page14.txt: [('AMERI', 'CAN')]
GOH18971201-V01-11-page16.txt: [('mis', 'sionaries')]
GOH18971201-V01-11-page18.txt: [('Gos', 'PEL')]
GOH18971201-V01-11-page19.txt: [('fl', 'It')]
GOH18971201-V01-11-page7.txt: [('tid', 'bits')]
GOH18980101-V02-01-page16.txt: [('MIS', 'SIONARY')]
GOH18980101-V02-01-page2.txt: [('POPU', 'LARLY')]
GOH18980101-V02-01-page22.txt: [('co', 'operate'), ('ex', 'pected')]
GOH18980101-V02-01-page3.txt: [('co', 'operative')]
GOH18980101-V02-01-page5.txt: [('co', 'operation'), ('thei', 'r')]
GOH18980101-V02-01-page7.txt: [('co', 'operation')]
GOH18980101-V02-01-page9.txt: [('Mc', 'Dowell')]
GOH18980201-V02-02-page18.txt: [('Gos', 'PEL')]
GOH18980301-V02-03-page1.txt: [('COMMIS', 'SION')]
GOH18980301-V02-03-page16.txt: [('co', 'operate'), ('ca', 'n')]
GOH18980501-V02-05-page17.txt: [('indiffer', 'ence'), ('RELA', 'TION')]
GOH18980501-V02-05-page20.txt: [('re', 'turn')]
GOH18980501-V02-05-page6.txt: [('re', 'ceived')]
GOH18980601-V02-06-page19.txt: [('CHIL', 'DREN')]
GOH18980601-V02-06-page2.txt: [('se', 'as')]
GOH18980601-V02-06-page21.txt: [('Healthf', 'ul')]
GOH18980601-V02-06-page24.txt: [('co', 'operation'), ('Gos', 'PEL')]
GOH18980601-V02-06-page27.txt: [('Ra', 'T'), ('tS', 'P')]
GOH18980701-V02-07-page17.txt: [('impris', 'onment'), ('Gos', 'PEL')]
GOH18980701-V02-07-page18.txt: [('SANI', 'TARIUM')]
GOH18980701-V02-07-page19.txt: [('ORDI', 'NARY')]
GOH18980701-V02-07-page2.txt: [('al', 'Ways')]
GOH18980701-V02-07-page20.txt: [('ry', 'e')]
GOH18980801-V02-08-page15.txt: [('Associa', 'tion'), ('co', 'operation')]
GOH18980801-V02-08-page16.txt: [('Mc', 'Clure'), ('co', 'operate')]
GOH18980801-V02-08-page2.txt: [('eV', 'I'), ('fo', 'r')]
GOH18980801-V02-08-page26.txt: [('condi', 'tions')]
GOH18980801-V02-08-page28.txt: [('ac', 'cessible')]
GOH18980801-V02-08-page5.txt: [('Mc', 'Coy')]
GOH18980901-V02-09-page24.txt: [('Mc', 'Coy')]
GOH18980901-V02-09-page27.txt: [('M.', '')]
GOH18980901-V02-09-page28.txt: [('invari', 'ably')]
GOH18981001-V02-10-page12.txt: [('fr', 'y')]
GOH18981001-V02-10-page27.txt: [('TA', 'RE'), ('RE', 'TREAT')]
GOH18981101-V02-11-page13.txt: [('RECOG', 'NITION')]
GOH18981101-V02-11-page14.txt: [('Gos', 'PEL')]
GOH18981101-V02-11-page18.txt: [('Mc', 'Coy')]
GOH18981101-V02-11-page19.txt: [('ga', 't'), ('gi', 'g'), ('kAk', 'A')]
GOH18981101-V02-11-page2.txt: [('fl', 'O')]
GOH18981101-V02-11-page20.txt: [('VA', 'R')]
GOH18981101-V02-11-page8.txt: [('COMMUNICA', 'TIONS')]
GOH18981201-V02-12-page1.txt: [('ex', 'pected')]
GOH18981201-V02-12-page2.txt: [('HEALT', 'H')]
GOH18981201-V02-12-page25.txt: [('CO', 'O')]
GOH18981201-V02-12-page27.txt: [('Un', 'a'), ('CLAREM', 'ONT'), ('Ti', 'to'), ('RE', 'TREAT')]
GOH18981201-V02-12-page5.txt: [('co', 'operate')]
GOH18990101-V03-01-page11.txt: [('wa', 'tchings')]
GOH18990101-V03-01-page16.txt: [('co', 'operation'), ('Gos', 'PEL')]
GOH18990101-V03-01-page2.txt: [('CR', 'EEK')]
GOH18990101-V03-01-page9.txt: [('bene', 'ficial')]
GOH18990201-V03-02-page18.txt: [('extremi', 'ties')]
GOH18990201-V03-02-page2.txt: [('RE', 'TREAT')]
GOH18990201-V03-02-page5.txt: [('PEO', 'PLE')]
GOH18990201-V03-02-page9.txt: [('enem', 'y')]
GOH18990401-V03-04-page17.txt: [('Mc', 'Coy'), ('Af', 'ter')]
GOH18990401-V03-04-page6.txt: [('co', 'operate')]
GOH18990501-V03-05-page14.txt: [('ea', 't')]
GOH18990501-V03-05-page19.txt: [('Mc', 'Coy')]
GOH18990501-V03-05-page24.txt: [('APPETIZ', 'ING')]
GOH18990601-V03-06-page25.txt: [('co', 'operate')]
GOH18990601-V03-06-page26.txt: [('Cerebro', 'Spinal'), ('Sani', 'tarium'), ('th', 'e')]
GOH18990601-V03-06-page27.txt: [('ef', 'fective')]
GOH18990701-V03-07-page13.txt: [('Mc', 'Coy')]
GOH18990701-V03-07-page14.txt: [('Mc', 'Coy')]
GOH18990701-V03-07-page18.txt: [('co', 'operation'), ('Gos', 'PEL')]
GOH18990701-V03-07-page19.txt: [('dolla', 'r')]
GOH18990701-V03-07-page2.txt: [('spe', 'cial'), ('TA', 'RE'), ('Sla', 'y'), ('recog', 'nized'), ('RE', 'TREAT')]
GOH18990701-V03-07-page7.txt: [('IL', 'E')]
GOH18990901-V03-09-page1.txt: [('hy', 'gienic')]
GOH18990901-V03-09-page10.txt: [('PULMO', 'NARY')]
GOH18990901-V03-09-page2.txt: [('infor', 'mation')]
GOH18990901-V03-09-page5.txt: [('Exten', 'sive')]
GOH18991001-V03-10-page22.txt: [('co', 'ordination')]
GOH18991001-V03-10-page23.txt: [('th', 'e')]
GOH18991001-V03-10-page3.txt: [('redemp', 'tion')]
GOH18991101-V03-11-page1.txt: [('ca', 'pacity')]
GOH18991101-V03-11-page11.txt: [('Sani', 'tarium')]
GOH18991101-V03-11-page16.txt: [('Mc', 'Abee')]
GOH18991101-V03-11-page9.txt: [('co', 'operate')]

Check Correction 6

In [29]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/GOH/correction6

Average verified rate: 0.985174017900436

Average of error rates: 0.02240597014925373

Total token count: 472279

In [30]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[30]:
[('m', 465),
 ('d', 429),
 ("'", 411),
 ('e', 349),
 ('w', 213),
 ('f', 198),
 ('g', 156),
 ('t', 155),
 ('r', 149),
 ('n', 100),
 ('u', 81),
 ('co', 79),
 ('k', 47),
 ('q', 41),
 ('op', 33),
 ('db', 31),
 ('ft', 30),
 ('camp-ground', 26),
 ('th', 25),
 ('z', 25),
 ("workingmen's", 24),
 ("''", 23),
 ('abbie', 20),
 ('x', 18),
 ('ex', 16),
 ('lenna', 13),
 ("the'", 13),
 ('left-over', 13),
 ('wm', 12),
 ('-', 12)]

Correction 7 -- Rejoin Split Words II

In [31]:
# %load shared_elements/rejoin_split_words.py
prev = "correction6"
cycle = "correction7"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)
    
    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    errors = reports.identify_errors(tokens, spelling_dictionary)

    replacements = clean.check_if_stem(errors, spelling_dictionary, tokens, get_prior=True)
    
    if len(replacements) > 0:
        print('{}: {}'.format(filename, replacements))

        for replacement in replacements:
            content = clean.replace_split_words(replacement, content)
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
GOH18970201-V01-01-page14.txt: [('Y', 'es')]
GOH18970201-V01-01-page20.txt: [('SHAKES', 'PEARE'), ('INTER', 'LAKEN')]
GOH18970301-V01-02-page1.txt: [('C', 'oot')]
GOH18970301-V01-02-page19.txt: [('T', 'WENTY-NINE')]
GOH18970301-V01-02-page20.txt: [('INTER', 'LAKEN')]
GOH18970301-V01-02-page3.txt: [('OR', "PHANS'")]
GOH18970601-V01-05,06-page1.txt: [('C', 'oot')]
GOH18970601-V01-05,06-page10.txt: [('a', 'nd')]
GOH18970601-V01-05,06-page15.txt: [('MED', 'ICAL')]
GOH18970601-V01-05,06-page23.txt: [('f', 'ood')]
GOH18970601-V01-05,06-page5.txt: [('MISSION', 'ARY')]
GOH18970601-V01-05,06-page7.txt: [('MISSION', 'ARY')]
GOH18970601-V01-05,06-page9.txt: [('MISSION', 'ARY')]
GOH18970801-V01-07-page14.txt: [('the', 'se')]
GOH18971001-V01-09-page16.txt: [('HEAL', 'TH')]
GOH18971201-V01-11-page1.txt: [('COU', 'GH')]
GOH18971201-V01-11-page12.txt: [('con', 'stantly')]
GOH18971201-V01-11-page2.txt: [('A', 'ND')]
GOH18980101-V02-01-page18.txt: [('MED', 'ICAL')]
GOH18980101-V02-01-page25.txt: [('MED', 'ICAL')]
GOH18980101-V02-01-page32.txt: [('Mission', 'ary')]
GOH18980101-V02-01-page37.txt: [('i', 'da'), ('grand', 'Pa')]
GOH18980501-V02-05-page17.txt: [('indiffer', 'ence')]
GOH18980501-V02-05-page2.txt: [('dis', 'couraged')]
GOH18980501-V02-05-page20.txt: [('MED', 'ICAL')]
GOH18980601-V02-06-page2.txt: [("'", 're')]
GOH18980601-V02-06-page27.txt: [('S', 'Ra')]
GOH18980601-V02-06-page30.txt: [('o', 'nce')]
GOH18980601-V02-06-page32.txt: [('con', 'cise')]
GOH18980701-V02-07-page1.txt: [('differ', 'ent')]
GOH18980701-V02-07-page2.txt: [('if', 'Ni')]
GOH18980701-V02-07-page20.txt: [('m', 'Oe')]
GOH18980801-V02-08-page28.txt: [('ac', 'cessible')]
GOH18980901-V02-09-page28.txt: [('BAT', 'TLE'), ('he', 'th')]
GOH18981001-V02-10-page12.txt: [('s', 'OP')]
GOH18981001-V02-10-page14.txt: [('poison', 'ous')]
GOH18981001-V02-10-page19.txt: [('pro', 'ducing')]
GOH18981001-V02-10-page26.txt: [('sub', 'scription')]
GOH18981001-V02-10-page28.txt: [('In', 'Cr')]
GOH18981201-V02-12-page5.txt: [('G', 'ranose')]
GOH18981201-V02-12-page9.txt: [('clean', 'liness')]
GOH18990101-V03-01-page9.txt: [('bene', 'ficial')]
GOH18990201-V03-02-page1.txt: [('G', 'uadalajara')]
GOH18990201-V03-02-page24.txt: [('de', 'licious')]
GOH18990301-V03-03-page2.txt: [('I', 'nternational')]
GOH18990301-V03-03-page20.txt: [('Thor', 'oughly')]
GOH18990401-V03-04-page20.txt: [('Thor', 'oughly')]
GOH18990401-V03-04-page7.txt: [('mas', "ter's")]
GOH18990501-V03-05-page1.txt: [('S', 'ite')]
GOH18990601-V03-06-page18.txt: [('flak', 'es')]
GOH18990601-V03-06-page26.txt: [('F', 'Aro'), ('O', 'ne')]
GOH18990601-V03-06-page27.txt: [('ef', 'fective')]
GOH18990601-V03-06-page9.txt: [('in', 'structions')]
GOH18990801-V03-08-page2.txt: [('W', 'ashington')]
GOH18990801-V03-08-page8.txt: [('a', 'ny')]
GOH18990901-V03-09-page1.txt: [('hy', 'gienic')]
GOH18990901-V03-09-page2.txt: [('infor', 'mation')]
GOH18990901-V03-09-page20.txt: [('exceed', 'ingly')]
GOH18990901-V03-09-page5.txt: [('Exten', 'sive')]
GOH18991101-V03-11-page19.txt: [('y', 'ou')]
GOH18991101-V03-11-page5.txt: [('par', 'tition')]

Check Correction 7

In [32]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/GOH/correction7

Average verified rate: 0.9852743431793781

Average of error rates: 0.02224179104477612

Total token count: 472237

In [33]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[33]:
[('m', 464),
 ('d', 429),
 ("'", 410),
 ('e', 349),
 ('w', 212),
 ('f', 197),
 ('g', 155),
 ('t', 154),
 ('r', 149),
 ('n', 100),
 ('u', 81),
 ('co', 79),
 ('k', 47),
 ('q', 41),
 ('op', 33),
 ('db', 31),
 ('ft', 30),
 ('camp-ground', 26),
 ('z', 25),
 ('th', 24),
 ("workingmen's", 24),
 ("''", 23),
 ('abbie', 20),
 ('x', 18),
 ('ex', 16),
 ('lenna', 13),
 ("the'", 13),
 ('left-over', 13),
 ('wm', 12),
 ('-', 12)]

Review Remaining Errors

In [34]:
reports.docs_with_high_error_rate(summary)
Out[34]:
[('GOH18970601-V01-05,06-page17.txt', 0.394),
 ('GOH18981101-V02-11-page19.txt', 0.261),
 ('GOH18980701-V02-07-page19.txt', 0.219),
 ('GOH18990401-V03-04-page2.txt', 0.214),
 ('GOH18981101-V02-11-page2.txt', 0.207),
 ('GOH18980701-V02-07-page20.txt', 0.202)]
In [35]:
doc_keys = [x[0] for x in reports.docs_with_high_error_rate(summary) if x[1] > 0.2]
In [36]:
# utilities.open_original_docs(doc_keys, directories['cycle'])

High errors on image pages and advertisements.

In [37]:
reports.long_errors(errors_summary, min_length=15)
Out[37]:
(['littleunderstood',
  'ttitrittittittittittmtmttittitttttittimttittittittimmttitti',
  'thoughtstimulating',
  'oriderfullrsupcessfal',
  'heaven-appointed',
  'house-furnishing',
  'scienceinthekitchen',
  'spanish-speaking',
  'awnidllinfinthdethieghbeosotkf',
  'applrthescprinciplesitrthe',
  'commander-in-chief',
  'aawaaaaaovvvoovvvvy',
  'tooverindulgence',
  'numbercontaining',
  'cettesseciecceseveessies',
  'wwwwhuowwwwwwwwwwaiwwwwwwwwimaiwuaiiimai',
  'itnhethctecmhtpatiear',
  'csaetanldogfouer',
  'square-shouldered',
  'unimpededlcirculation',
  'carnalmindedness',
  'esmliornagbligeetereetscoga',
  'poverty-stricken',
  'frankfort-on-the-main',
  'blackberry-juice',
  'osteromearammerse',
  'thisadvertisement',
  'sewage-contaminated',
  'aaatasisaasaaalaisa',
  'healthdestroying',
  'health-destroying',
  'insttiitututition',
  'commandment-keeping',
  'boarding-schools',
  'vvvvvvvvvvvvvvykin',
  'slaughter-houses',
  'aiiwtyaiylliyiiimmamosimmonm',
  'timittitiimmimimitttimitmtiiittimmtmmittittimmtimmimmmmittmtm',
  'efflacsamalmomasew',
  'selfforgetfulness',
  'ordinarylcrackers',
  'ordinaryconventional',
  'ihwvinwiniviniaina',
  'training-classes',
  'govtomovgnoougutovnft',
  'followinginstitutions',
  'whitney-salisbury',
  'specialadvantages',
  'self-independence',
  'mtimimmmtimimmttitimimitimmitimitimimiiiii',
  'blessing-accompanying',
  'brain-stupefying',
  'aaaaaaaaaaamaaaaaaaaa',
  'brain-destroying',
  'atithatameloftwo',
  'self-forgetfulness',
  'mmmmmammmammmmimawnam',
  'energy-consuming',
  'energy-producing',
  'hundredattlaaerintiona',
  'tikviavtvyamosionononswwww',
  'self-righteousness',
  'wtitniosonomionli',
  'ulialluilumuluiliumulimam',
  'tawaaaaaaaaaaaaom',
  'mtimmtmmtmemmmrwrmyrymmirimvmmmwmt',
  'extraordinaryway',
  'trdttttitattzwzuw',
  'menniryfrimyyttittyityyttimmityvvvniyvvmmvvvvvyvvrrmyyttmitmimmitttyvvvvm',
  'kibroth-hattaavah',
  'poison-producing',
  'encouragement-to',
  'self-justification',
  'lailuuliliuwwwwwwwwwwwwwwwwwwwwaualuuumuuukiliwwwwww',
  'blood-bespattered',
  'zannaanaaaaaaaawaa',
  'universattestimoni',
  'vvvvyvvvvvvvvvyvvvvvvvvvvvvviaaaaaanyvvvvvvp',
  "years'experiencein",
  'pleasure-seeking',
  'sanitaritimdnring',
  'electric-thermal',
  'genito-urinaryand',
  'fellifiniffeemiiiiiitttliiiiiiiiiimmoionnowannomp',
  'ljazreeitnihsichaous',
  'alkaloidcontaining',
  'mttttttttttttuattt',
  'consumptiongerms',
  'blood-corpuscles',
  'iiiiiiiiilllliiiil',
  'self-gratification',
  'exastwoeftwwwwiess',
  'titityymyytitymyttivityymyymtmitymtityiti',
  'ordinaryrxrackers',
  'sanitaryandelectricalsupplyco',
  'offensive-smelling',
  'fourteen-year-old',
  'isksymonsemmeartal',
  'airawsaaaaaasaaf',
  'milloaawymemoamom',
  'andelectricalsupplyco',
  'counter-irritant'],
 15)

Correction 8 -- Remove long error tokens

In [39]:
# %load shared_elements/remove-tokens-with-long-strings-of-characters.py
prev = cycle
cycle = "correction8"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    text = re.sub(r"[0-9,!?$:;&]", " ", content)
    tokens = utilities.tokenize_text(text)
    
    replacements = []
    replacements.append(clean.check_for_repeating_characters(tokens, "a|A"))
    replacements.append(clean.check_for_repeating_characters(tokens, "v|V"))
    replacements.append(clean.check_for_repeating_characters(tokens, "w|W"))
    replacements.append(clean.check_for_repeating_characters(tokens, "i|I"))
    replacements.append(clean.check_for_repeating_characters(tokens, "m|M"))
    replacements.append(clean.check_for_repeating_characters(tokens, "t|T"))
    
    replacements = [item for sublist in replacements for item in sublist]
            
    if len(replacements) > 0:
        print('{}: {}'.format(filename, replacements))

        for replacement in replacements:
            content = clean.replace_pair(replacement, content)
    else:
        pass
    
    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
GOH18970201-V01-01-page2.txt: [('MTIMIMMMTIMIMMTTITIMIMITIMMITIMITIMIMIIIII', ' ')]
GOH18970201-V01-01-page20.txt: [('lailuuliliuwwwwwwwwwwwwwwwwwwwwauAluuumuuukiliwwwwww', ' ')]
GOH18970301-V01-02-page2.txt: [('TIMITTITIIMMIMIMITTTIMITMTIIITTIMMTMMITTITTIMMTIMMIMMMMitTMTM', ' '), ('TIMITTITIIMMIMIMITTTIMITMTIIITTIMMTMMITTITTIMMTIMMIMMMMitTMTM', ' ')]
GOH18970301-V01-02-page20.txt: [('WWWWHUOWWWWWWWWWWAIWWWWWWWWIMAIWUAIIIMAI', ' '), ('mtimmtmmTmemmmrwrmyrymmirimvmmmwmt.', ' ')]
GOH18970601-V01-05,06-page2.txt: [('MMMMAL"MMMMAMMWMMM', ' ')]
GOH18970801-V01-07-page1.txt: [('TTITriTTITTITTITTITTMTMTTITTITTTTTITTIMTTITTITTITTIMMTTITTI', ' ')]
GOH18970801-V01-07-page2.txt: [('MttttttttttttUAttt', ' ')]
GOH18970801-V01-07-page20.txt: [('AAAAAAAAAAAAAA', ' '), ('ZANNAANAAAAAAAAWAA', ' '), ('AAAAAAAAAAAMAAAAAAAAA', ' '), ('tAWAAAAAAAAAAAAOM', ' ')]
GOH18971001-V01-09-page19.txt: [('VVVVYVVVVVVVVVYVVVVVVVVVVVVVIAAAAAANYVVVVVVP', ' ')]
GOH18971001-V01-09-page3.txt: [('iiiiiiiiilllliIIIl', ' ')]
GOH18980601-V02-06-page2.txt: [('aaasOlaaaaaaop', ' ')]
GOH18981001-V02-10-page1.txt: [('MMMMMAMMMAMMMMIMAWNAM', ' ')]
GOH18990301-V03-03-page2.txt: [('VVVVVVVVVVVVVVYkin', ' ')]
GOH18990601-V03-06-page27.txt: [('airawsaaaaaasaaf', ' '), ('aaatasisaasaaalaisa', ' ')]
GOH18990701-V03-07-page20.txt: [('menniryfrIMYYTTITTYITYYTTIMMITYvvvniyvvmmvvvvvyvvrrMYYTTMITMIMMITTTyvvvvM', ' '), ('menniryfrIMYYTTITTYITYYTTIMMITYvvvniyvvmmvvvvvyvvrrMYYTTMITMIMMITTTyvvvvM', ' '), ('menniryfrIMYYTTITTYITYYTTIMMITYvvvniyvvmmvvvvvyvvrrMYYTTMITMIMMITTTyvvvvM', ' ')]
GOH18990801-V03-08-page20.txt: [('FellifiNiffeeMiiiiiitttliiiiiiiiiiMMOIONNOWANNOMP', ' ')]

Correction 9 -- Separate Squashed Words

In [41]:
# %load shared_elements/separate_squashed_words.py
import pandas as pd
from math import log

prev = cycle
cycle = "correction9"

directories = utilities.define_directories(prev, cycle, base_dir)
if not os.path.exists(directories['cycle']):
    os.makedirs(directories['cycle'])

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

verified_tokens = []

for filename in corpus:  
    content = utilities.readfile(directories['prev'], filename)
    clean.get_approved_tokens(content, spelling_dictionary, verified_tokens)

tokens_with_freq = dict(collections.Counter(verified_tokens))
words = pd.DataFrame(list(tokens_with_freq.items()), columns=['token','freq'])
words_sorted = words.sort_values('freq', ascending=False)
words_sorted_short = words_sorted[words_sorted.freq > 2]

sorted_list_of_words = list(words_sorted_short['token'])

wordcost = dict((k, log((i+1)*log(len(sorted_list_of_words)))) for i,k in enumerate(sorted_list_of_words))
maxword = max(len(x) for x in sorted_list_of_words)

corpus = (f for f in listdir(directories['prev']) if not f.startswith('.') and isfile(join(directories['prev'], f)))

for filename in corpus:
    content = utilities.readfile(directories['prev'], filename)

    text = utilities.strip_punct(content)
    tokens = utilities.tokenize_text(text)
    
    replacements = []
    
    for token in tokens:
        if not token.lower() in spelling_dictionary:
            if len(token) > 17:
                if re.search(r"[\-\-\'\"]", token):
                    pass
                else:
                    split_string = clean.infer_spaces(token, wordcost, maxword)
                    list_split_string = split_string.split()
                    
                    if clean.verify_split_string(list_split_string, spelling_dictionary):
                        replacements.append((token, split_string))
                    else:
                        pass
            else:
                pass
        else:
            pass
        
    if len(replacements) > 0:
        print("{}: {}".format(filename, replacements))
        
        for replacement in replacements:
            content = clean.replace_pair(replacement, content)
    
    else:
        pass

    with open(join(directories['cycle'], filename), mode="w") as o:
        o.write(content)
        o.close()
GOH18980601-V02-06-page30.txt: [('thoughtstimulating', 'thought stimulating')]
GOH18980801-V02-08-page2.txt: [('ScienceintheKitchen', 'Science in the Kitchen')]
GOH18980901-V02-09-page17.txt: [('ordinaryconventional', 'ordinary conventional')]
GOH18981201-V02-12-page27.txt: [('followinginstitutions', 'following institutions')]
In [44]:
# %load shared_elements/summary.py
summary = reports.overview_report(directories['cycle'], spelling_dictionary, title)
Directory: /Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/GOH/correction9

Average verified rate: 0.9853289032042201

Average of error rates: 0.022110447761194033

Total token count: 472221

In [45]:
# %load shared_elements/top_errors.py
errors_summary = reports.get_errors_summary( summary )
reports.top_errors( errors_summary, 10 )[:50]
Out[45]:
[('m', 464),
 ('d', 429),
 ("'", 410),
 ('e', 349),
 ('w', 212),
 ('f', 197),
 ('g', 155),
 ('t', 154),
 ('r', 149),
 ('n', 100),
 ('u', 81),
 ('co', 79),
 ('k', 47),
 ('q', 41),
 ('op', 33),
 ('db', 31),
 ('ft', 30),
 ('camp-ground', 26),
 ('z', 25),
 ('th', 24),
 ("workingmen's", 24),
 ("''", 23),
 ('abbie', 20),
 ('x', 18),
 ('ex', 16),
 ('lenna', 13),
 ("the'", 13),
 ('left-over', 13),
 ('wm', 12),
 ('-', 12)]
In [ ]: