Example of OCR Cleaning
In [1]:
# Reload imported modules automatically so edits to the text2topics
# package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
In [2]:
# Load libraries
from text2topics import reports
from text2topics import utilities
from text2topics import clean
import re
import os
from os import listdir
from os.path import isfile, join
import collections
In [3]:
# Directory holding the curated word lists, and the individual list
# filenames that together form the spelling dictionary.
wordlist_dir = "../data/word-lists/"

wordlists = [
    "2016-12-07-SDA-last-names.txt",
    "2016-12-07-SDA-place-names.txt",
    "2016-12-08-SDA-Vocabulary.txt",
    "2017-01-03-place-names.txt",
    "2017-02-14-Roman-Numerals.txt",
    "2017-03-01-Additional-Approved-Words.txt",
    "2017-05-05-base-scowl-list.txt",
    "2017-05-24-kjv-wordlist.txt",
]
In [4]:
spelling_dictionary = utilities.create_spelling_dictionary(wordlist_dir, wordlists)
In [29]:
# Load the OCR'd page text for one issue (presumably "HR" = Health Reformer,
# 1866-08-01 — TODO confirm against the corpus naming convention).
content = utilities.readfile("../data/", "HR18660801-V01-01-page3.txt")
print(content)
The first step is to get an overview of the errors
In [23]:
reports.identify_errors(utilities.to_lower(utilities.tokenize_text(utilities.strip_punct(test_file))), spelling_dictionary)
Out[23]:
Correct apostrophe errors
In [ ]:
def replace_apostrophe_error(content):
    """Use regex to restore apostrophes that the OCR rendered as "õ"/"Õ".

    The OCR engine reads the typographic apostrophe as õ (or Õ), e.g.
    "donõt" for "don't"; this substitutes a plain apostrophe.

    Note:
        Use this function before running :func:`remove_special_chars`,
        which would otherwise strip the õ marker entirely.

    Args:
        content (str): File content as string.

    Returns:
        str: File content with apostrophes restored.
    """
    # Character class replaces the alternation (õ|Õ); the unused capture
    # group in the original served no purpose.
    return re.sub(r"(\w+)[õÕ]", r"\1'", content)
In [28]:
# Fix apostrophes first, per the function's note: the õ marker would
# otherwise be stripped by remove_special_chars.
content = replace_apostrophe_error(content)
print(content)
The next step is to normalize characters and remove the remaining special characters.
In [ ]:
def remove_special_chars(content):
    """Strip characters outside the allowed alphanumeric/punctuation set.

    Each disallowed character is replaced with a single space, since stray
    special characters tend to occur at the ends of lines in the OCR output.

    Note:
        Modify this function before use if content includes characters from
        languages other than English.

    Args:
        content (str): File content as string.

    Returns:
        str: File content with special characters removed.
    """
    disallowed = re.compile(r"[^a-zA-Z0-9\s,.!?$:;\-&\'\"]")
    return disallowed.sub(" ", content)
def normalize_chars(content):
    """Use regex to normalize dash and apostrophe characters.

    Bug fix: the original patterns (e.g. ``r"—-—–‑"``) matched the literal
    *sequence* of characters rather than any one of them, so no substitution
    ever occurred. Character classes match each variant individually.

    Args:
        content (str): File content as string.

    Returns:
        str: File content with dashes and apostrophes normalized.
    """
    # Replace em dash, en dash, and non-breaking hyphen with a plain hyphen.
    content = re.sub(r"[—–‑]", "-", content)
    # Replace curly/slanted apostrophe variants with a plain apostrophe.
    content = re.sub(r"[’‘‛´]", "'", content)
    return content
In [30]:
# Normalize dash/apostrophe variants first, then strip remaining specials.
# NOTE(review): the original passed `test_file`, which is undefined on a
# fresh-kernel run; `content` is the variable carried through this notebook.
content = remove_special_chars(normalize_chars(content))
print(content)
In [31]:
reports.identify_errors(utilities.to_lower(utilities.tokenize_text(utilities.strip_punct(content))), spelling_dictionary)
Out[31]:
Next is to reconnect words where the line-ending was incorrectly interpreted.
In [ ]:
def connect_line_endings(content):
    """Use regex to reconnect two word segments separated by "- ".

    Only rejoins when the second fragment starts with a lowercase letter,
    so capitalized words after a hyphen are left untouched.

    Note:
        Use :func:`normalize_chars` before running :func:`connect_line_endings`
        so all dash variants have been reduced to a plain hyphen.
        (The docstring previously referenced a nonexistent
        ``correct_line_endings``.)

    Args:
        content (str): File content.

    Returns:
        str: File content with words rejoined.
    """
    # \s+ is the idiomatic form of \s{1,}; the hyphen/whitespace group is
    # dropped rather than captured-and-discarded.
    return re.sub(r"(\w+)-\s+([a-z]+)", r"\1\2", content)
In [39]:
# Rejoin words that were split across line breaks with "- ".
content = connect_line_endings(content)
print(content)
In [40]:
reports.identify_errors(utilities.to_lower(utilities.tokenize_text(utilities.strip_punct(content))), spelling_dictionary)
Out[40]:
In [ ]:
def rejoin_split_words(content, spelling_dictionary, get_prior=False):
    """Rejoin word fragments that the OCR process split apart.

    Identifies out-of-dictionary tokens, asks :func:`clean.check_if_stem`
    whether each is a fragment of an adjacent token, and replaces every
    confirmed split pair in the text.

    Args:
        content (str): File content as string.
        spelling_dictionary: Known-good words (as produced by
            ``utilities.create_spelling_dictionary``).
        get_prior (bool): Forwarded to :func:`clean.check_if_stem`.
            Defaults to False.

    Returns:
        str: File content with split words rejoined.
    """
    tokens = utilities.tokenize_text(utilities.strip_punct(content))
    errors = reports.identify_errors(tokens, spelling_dictionary)
    # Bug fix: forward the caller's get_prior instead of hard-coding False,
    # which made the parameter a silent no-op.
    replacements = clean.check_if_stem(errors, spelling_dictionary, tokens, get_prior=get_prior)
    if len(replacements) > 0:
        for replacement in replacements:
            print(replacement)
            content = clean.replace_split_words(replacement, content)
    else:
        print("No replacement pairs found.")
    return content
In [45]:
# Rejoin remaining split words, using the spelling dictionary as the oracle.
content = rejoin_split_words(content, spelling_dictionary)
print(content)
In [46]:
reports.identify_errors(utilities.to_lower(utilities.tokenize_text(utilities.strip_punct(content))), spelling_dictionary)
Out[46]:
In [ ]: