calculate-error-rates-round-4
In [1]:
from os import listdir
from os.path import isfile, join
import csv
import datetime
import nltk
from nltk.corpus import words
from nltk import word_tokenize
import re
In [2]:
input_dir = "/Users/jeriwieringa/Dissertation/text/text-current/2016-11-16-corpus-with-preliminary-cleaning/"
out_dir = "/Users/jeriwieringa/Dissertation/drafts/data/spelling-statistics/round4/"
word_list_dir = "/Users/jeriwieringa/Dissertation/drafts/data/word-lists/"
In [3]:
corpus = [f for f in listdir(input_dir) if not f.startswith('.') and isfile(join(input_dir, f))]
In [4]:
titles = ["ADV", "AmSn", "ARAI", "CE", "CUV", "EDU", "GCB", "GH", "GOH", "GS", "HM", "HR",
"IR", "LB", "LH", "LibM", "LUH", "NMN","PHJ","PTAR","PUR","RH","Sligo","SOL",
"ST","SUW","TCOG","TMM","WMH","YI"]
In [5]:
# Function for pulling words from a txt file
def load_from_txt(file_name):
    with open(file_name, "r") as txt:
        words = txt.read().splitlines()
    word_list = [w.lower() for w in words]
    return(word_list)
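As a quick spot check on the loader, the following sketch prints the size and first few entries of one of the lists (the file is one of the word lists loaded below):
In [ ]:
# Sketch: inspect one word list to confirm it loaded and lowercased correctly
sample_words = load_from_txt(join(word_list_dir, '2016-12-06-First-SDA-Word-List.txt'))
print(len(sample_words), sample_words[:5])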
In [6]:
generic_list = load_from_txt(join(word_list_dir, '2016-12-06-First-SDA-Word-List.txt'))
person_names = load_from_txt(join(word_list_dir, '2016-12-07-SDA-last-names.txt'))
place_names = load_from_txt(join(word_list_dir, '2016-12-07-SDA-place-names.txt'))
place_names_2 = load_from_txt(join(word_list_dir, '2017-01-03-place-names.txt'))
sda_words = load_from_txt(join(word_list_dir, '2016-12-08-SDA-Vocabulary.txt'))
In [7]:
spelling_dictionary = list(set(generic_list + person_names + place_names + place_names_2 + sda_words))
In [8]:
def refresh_dictionary():
    sda_words = load_from_txt(join(word_list_dir, '2016-12-08-SDA-Vocabulary.txt'))
    return(list(set(generic_list + person_names + place_names + place_names_2 + sda_words)))
In [9]:
def check_words(text, filename, spell_dictionary):
    # Clean 1:
    '''
    Replace punctuation with a space to avoid attaching line-ending errors to words.
    Remove the '-' of hyphenated words. This allows me to check the value of
    each part of the combined word without having to expand the dictionary too much.
    It also allows for greater variability in the construction of hyphenated words
    (as was often the case in 19th-century writing).
    '''
    text_cleaned = re.sub(r"[0-9,.!?$:;|]", " ", text)
    # Special line for dashes to account for the variety of encodings
    text_cleaned = re.sub(r"[-—–‑]", " ", text_cleaned)
    # Clean 2:
    '''
    Correct occurrences of wordsõ and wordõs to words' and word's. This pattern is seen in ADV, HR, and SUW.
    õ does not occur as a spelling error in the other periodical titles. However, this pattern should prevent a
    too-greedy clearing out of the character.
    '''
    text_cleaned = re.sub(r"(\w+)(õ|Õ)", r"\1'", text_cleaned)
    # Clean 3:
    '''
    Correct for names that have run together (and possibly some phrases) by identifying words with capitalization
    in the middle, isolating the capitals, and adding a space.
    Solution from Stack Overflow: http://stackoverflow.com/questions/1097901/
    '''
    text_cleaned = re.sub(r"((?<=[a-z])[A-Z]|[A-Z](?=[a-z]))", r" \1", text_cleaned)
    # Clean 4:
    '''
    Remove all non-alpha characters. As these can be found in the middle of words, I am not replacing them with a
    space, as I did above. Leave "'" for possessives and contractions.
    '''
    text_cleaned = re.sub(r"[^\sa-zA-Z'’]", "", text_cleaned)

    tokens = word_tokenize(text_cleaned)
    tokens_lower = [w.lower() for w in tokens]
    # print(tokens_lower)

    # Any token not found in the dictionary is counted as an error
    errors = set(tokens_lower) - set(spell_dictionary)
    freq_distribution = nltk.FreqDist(tokens_lower)

    error_report = {}
    error_total = 0
    for error in list(errors):
        error_count = freq_distribution[error]
        error_total = error_total + error_count
        error_report.update({error: error_count})

    overview = {'doc_id': filename, 'num_tokens': len(tokens), 'num_errors': error_total, 'errors': error_report}
    return(overview)
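The cleaning steps are easier to follow on a small invented sample. The sketch below (the sample string is made up) exercises the punctuation, hyphen, õ, and run-together-capitals patterns in one pass:
In [ ]:
# Sketch: a made-up sample that triggers each cleaning step.
# "well-known" splits at the hyphen, "Bro.JamesWhite" loses the period and
# splits at the internal capitals, and "churchõs" becomes "church's".
sample = "well-known; Bro.JamesWhite spoke, and the churchõs members came."
demo = check_words(sample, 'sample.txt', spelling_dictionary)
print(demo['num_tokens'], demo['num_errors'])
print(demo['errors'])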
In [10]:
def process_texts(title):
    statistics = []
    for filename in corpus:
        if filename.startswith(title):
            # print(filename)
            with open(input_dir + filename, "r") as f:
                content = f.read()
            stats = check_words(content, filename, spelling_dictionary)
            statistics.append(stats)
    return(statistics)
In [11]:
def test_process(file):
    with open(input_dir + file, "r") as f:
        print(file)
        content = f.read()
        print(content)
        stats = check_words(content, file, spelling_dictionary)
        print("Errors: {}".format(stats['errors']))
        print(stats)
In [12]:
test_process('AmSn18910402-V06-14-page1.txt')
In [13]:
from collections import Counter
import csv

def process_title(title):
    print(title)
    statistics = process_texts(title)

    # Get summary statistics on the errors
    all_errors = [report['errors'] for report in statistics]
    inp = [dict(x) for x in all_errors]
    errors_summary = Counter()
    for y in inp:
        errors_summary += Counter(y)

    # Save per-page statistics for each periodical title
    with open("{}{}-corpus-spelling-errors-round-4-{}.csv".format(out_dir, str(datetime.date.today()), title),
              "w") as csv_file:
        fieldnames = ['doc_id', 'num_tokens', 'num_errors', 'errors']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        for stats in statistics:
            writer.writerow(stats)

    # Save error counts for each periodical title
    with open("{}{}-Spelling-Errors-{}.txt".format(out_dir, str(datetime.date.today()), title), "w") as outfile:
        fieldnames2 = ['spell_error', 'count']
        writer2 = csv.writer(outfile)
        writer2.writerow(fieldnames2)
        for key, value in errors_summary.items():
            writer2.writerow([key, value])
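process_title writes both reports to disk. To get a single aggregate error rate per title, a minimal sketch (the error_rate helper below is not part of the pipeline above) could sum across the per-page reports:
In [ ]:
# Sketch: aggregate error rate for one title from the per-page statistics,
# computed as total errors over total tokens across all pages.
def error_rate(title):
    statistics = process_texts(title)
    total_tokens = sum(s['num_tokens'] for s in statistics)
    total_errors = sum(s['num_errors'] for s in statistics)
    return total_errors / total_tokens if total_tokens else 0.0

# e.g. error_rate('ADV')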
In [30]:
%time process_title('ADV')
In [31]:
%time process_title('AmSn')
In [32]:
%time process_title('ARAI')
In [36]:
%time process_title('CE')
In [39]:
%time process_title('CUV')
In [14]:
spelling_dictionary = refresh_dictionary()
%time process_title('EDU')
In [15]:
spelling_dictionary = refresh_dictionary()
%time process_title('GCB')
In [16]:
spelling_dictionary = refresh_dictionary()
%time process_title('GH')
In [17]:
spelling_dictionary = refresh_dictionary()
%time process_title('GOH')
In [18]:
spelling_dictionary = refresh_dictionary()
%time process_title('GS')
In [13]:
spelling_dictionary = refresh_dictionary()
%time process_title('HM')
In [15]:
spelling_dictionary = refresh_dictionary()
%time process_title('HR')
In [14]:
spelling_dictionary = refresh_dictionary()
%time process_title('IR')
In [14]:
spelling_dictionary = refresh_dictionary()
%time process_title('LB')
In [15]:
spelling_dictionary = refresh_dictionary()
%time process_title('LH')
In [16]:
spelling_dictionary = refresh_dictionary()
%time process_title('LibM')
In [17]:
spelling_dictionary = refresh_dictionary()
%time process_title('LUH')
In [18]:
spelling_dictionary = refresh_dictionary()
%time process_title('NMN')
In [19]:
spelling_dictionary = refresh_dictionary()
%time process_title('PHJ')
In [14]:
spelling_dictionary = refresh_dictionary()
%time process_title('PTAR')
In [14]:
spelling_dictionary = refresh_dictionary()
%time process_title('PUR')
In [15]:
spelling_dictionary = refresh_dictionary()
%time process_title('RH')
In [16]:
spelling_dictionary = refresh_dictionary()
%time process_title('Sligo')
In [17]:
spelling_dictionary = refresh_dictionary()
%time process_title('SOL')
In [18]:
spelling_dictionary = refresh_dictionary()
%time process_title('ST')
In [19]:
spelling_dictionary = refresh_dictionary()
%time process_title('SUW')
In [20]:
spelling_dictionary = refresh_dictionary()
%time process_title('TCOG')
In [21]:
spelling_dictionary = refresh_dictionary()
%time process_title('TMM')
In [22]:
spelling_dictionary = refresh_dictionary()
%time process_title('WMH')
In [23]:
spelling_dictionary = refresh_dictionary()
%time process_title('YI')
In [24]:
# %load shared_elements/system_info.py
import IPython
print(IPython.sys_info())
!pip freeze