calculate-error-rates-round-4

Table of Contents

In [1]:
from os import listdir
from os.path import isfile, join
import csv
import datetime
import nltk
from nltk.corpus import words
from nltk import word_tokenize
import re
In [2]:
input_dir = "/Users/jeriwieringa/Dissertation/text/text-current/2016-11-16-corpus-with-preliminary-cleaning/"
out_dir = "/Users/jeriwieringa/Dissertation/drafts/data/spelling-statistics/round4/"
word_list_dir = "/Users/jeriwieringa/Dissertation/drafts/data/word-lists/"
In [3]:
corpus = [f for f in listdir(input_dir) if not f.startswith('.') and isfile(join(input_dir, f))]
In [4]:
titles = ["ADV", "AmSn", "ARAI", "CE", "CUV", "EDU", "GCB", "GH", "GOH", "GS", "HM", "HR", 
          "IR", "LB", "LH", "LibM", "LUH", "NMN","PHJ","PTAR","PUR","RH","Sligo","SOL",
          "ST","SUW","TCOG","TMM","WMH","YI"]
In [5]:
# Function for pulling words from a txt file

def load_from_txt(file_name):
    with open(file_name, "r") as txt:
        words = txt.read().splitlines()
        word_list = [w.lower() for w in words]
    return(word_list)
In [6]:
generic_list = load_from_txt(join(word_list_dir, '2016-12-06-First-SDA-Word-List.txt'))
person_names = load_from_txt(join(word_list_dir, '2016-12-07-SDA-last-names.txt'))
place_names = load_from_txt(join(word_list_dir, '2016-12-07-SDA-place-names.txt'))
place_names_2 = load_from_txt(join(word_list_dir, '2017-01-03-place-names.txt'))
sda_words = load_from_txt(join(word_list_dir, '2016-12-08-SDA-Vocabulary.txt'))
In [7]:
spelling_dictionary = list(set(generic_list + person_names + place_names + place_names_2 + sda_words))
In [8]:
def refresh_dictionary():
    
    sda_words = load_from_txt(join(word_list_dir, '2016-12-08-SDA-Vocabulary.txt'))
    
    return(list(set(generic_list + person_names + place_names + place_names_2 + sda_words)))
In [9]:
def check_words(text, filename, spell_dictionary):
    
    # Clean 1: 
    ''' 
    Replace punctuation with a space to avoid attaching line ending errors to words. 
    Remove the '-' of hyphenated words. This allows me to check value of 
    each part of the combined word, without having to expand the dictionary too much. 
    Also allows for greater variability in the construction of hyphenated words 
    (as was often the case in 19th century writing.)
    '''
    text_cleaned = re.sub(r"[0-9,.!?$:;]\|", " ", text)
    
    # Special line for dashes to account for the variety of encodings
    
    text_cleaned = re.sub(r"[-—–‑]", " ", text_cleaned)
    
    # Clean 2:
    ''' 
    Correct occurances of wordsõ and wordõs to words' and word's. This pattern is seen in ADV, HR, and SUW.
    õ does not occur as a spelling error in the other periodical titles. However this pattern should prevent a 
    too-greedy clearing out of the character.
    '''
    text_cleaned = re.sub(r"(\w+)(õ|Õ)", r"\1'", text_cleaned)
    
    # Clean 3:
    ''' 
    Correct for names that have run together (and possibly some phrases) by identifying words with capitalization 
    in the middle, isolating the capitals, and adding a space.
    Solution from stack overflow: http://stackoverflow.com/questions/1097901/
    '''
    text_cleaned = re.sub(r"((?<=[a-z])[A-Z]|[A-Z](?=[a-z]))", r" \1", text_cleaned)
    
    # Clean 4: 
    '''
    Remove all non-alpha characters. As these can be found in the middle of words, I am not replacing them with a 
    space, as I did above. Leave "'" for possessives and contractions.
    '''
    text_cleaned = re.sub(r"[^\s[a-zA-Z'’]", "", text_cleaned)
    
    tokens = word_tokenize(text_cleaned)
    tokens_lower = [w.lower() for w in tokens]
    
#     print(tokens_lower)
    
    errors = set(tokens_lower)-set(spelling_dictionary)    
    
    freq_distribution = nltk.FreqDist(tokens_lower)
    
    error_report = {}
    
    error_total = 0
    for error in list(errors):
        error_count = freq_distribution[error]
        error_total = error_total + error_count
        error_report.update({error:error_count})
                          
    overview = {'doc_id': filename, 'num_tokens': len(tokens), 'num_errors': error_total, 'errors': error_report}
     
    return(overview)
In [10]:
def process_texts(title):
    statistics = []
    for filename in corpus:
        if filename.startswith(title):
#             print(filename)
            with open(input_dir + filename, "r") as f:
                
                content = f.read()
                
                stats = check_words(content, filename, spelling_dictionary)
                statistics.append(stats)
    return(statistics)
In [11]:
def test_process(file):
    with open(input_dir + file, "r") as f:
        print(file)
        content = f.read()
        print(content)
        stats = check_words(content, file, spelling_dictionary)
        print("Errors: {}".format(stats['errors']))
        print(stats)
In [12]:
test_process('AmSn18910402-V06-14-page1.txt')
AmSn18910402-V06-14-page1.txt
 VOLUME 6.
Equal and exact justice to all men, of whatever state or persuasion, religious or political.ÑThomas .7eferson. NEW YORK, APRIL 2, 1891.
NUMBER 14.
The Ametrican Sentinel. PUBLISHED WEEKLY, BY THE
PACIFIC PRESS PUBLISHING, COMPANY,
No. 43 Bow ST., NEW YORK.
Entered at the New York Post Office as Second Class Matter.
EDITOR, - - - ALONZO T. JONES.
ASSOCIATE EDITORS,
during the summer. If they decline to do so at other times of the year they violate no law. . .
If employers do not see fit to observe the half-holiday in their own business there is nowhere any authority to compel them.
We have italicized one sentence in the foregoing to call attention to the central thought in it, namely, that " there is, no power in the Legislature to enact " a law compelling suspension of business on Saturday afternoon. This we believe to be the truth. Then how is it' that laws are not only passed but are sustained by the courts compelling suspension of business on Sunday ?
¥¥
What Is the Guide to Morality?
AT the end of his discussion of the subject of "Ethics for Schools," Mr. Bierbower come to " conscientiousness." In fact this point is touched upon in the very beginning of the introduction of the book, so that the beginning and the end, the first and the last, deals with the question of conscience. In stating " the ground of right," the second paragraph in the book says :Ñ
We recognize right by our judgment of what is best, and by a feelingÑconscienceÑwhich indicates, as the result of many impressions, what we ought to do, and impels us thereto.
And the last chapter of the book begins with the following paragraph
The most general rule of morality is to do what you believe right and good, and to preserve the perpetual consciousness of,this by instantly performing your duty, when seen. Goodness is simple when thus reduced to one rule. For you have but to look at your conscience to see your duty, conscience being the sense of what we ought to de, which results from all our thought and information on the subject. ÑPage 283.
¥ This ground of right is just as treacherous as that which was discovered in the previous article on this subject; in fact, it is the same thing only stated in other words; yet as it enters the realm of conscience it touches the real ground of supreme right, and ultimate good. If conscience were a true guide, then this rule would be good enough; but conscience is not a true guide. Conscience as a guide
is as erratic as any other faculty in man. The truth is that conscience itself must be guided. This is admitted by the book now under notice. One statement to this effect is as follows :Ñ
C. P. BOLLMAN,
W. IL Mc KEE.
¥
It is important then in taking conscience as a
guide, to have it in working order.ÑPage 284.
Yes, we should naturally suppose so. Any kind of an instrument that is not in working order is not of much use; and especially in questions of conscience and of ultimate right. And in this case even to think of taking as a guide an instrument that could ever by any possibility get out of working order, seems a most singular suggestion. Another statement to the same purpose is as follows :Ñ
We can not do right to-day on yesterday's wrongs ; so that men should often straighten out their conscience to get its legitimate indications. ÑPage 28.4.
And again:Ñ
Inspect your conscience as well as your observance of it, or, rather, look after your views of right as well as your conformity thereto. ÑPage 290.
Of what use is a rule of right which goes so much awry and becomes so easily kinked that it needs " often " to be straightened out ? And, of what use is a guide that has to be held up for inspection every little while ?
Again we read :Ñ
Though conscience may err, it is the best judgment we haveÑthe pointing of the compass after all the conflicting forces which would diversely impel us, and so coming of our knowledge to a head in the will.ÑPage 283.
With how much certainty can a compass be depended upon which not only may, but confessedly does, often point the wrong way ? What insurance company or ship owner would send a ship to sea with such a compass as that ? What captain or sailor would think of starting to sea with such a compass ? The strangest part of this whole systein of ethics, is that conscience would be recommended as a guide, when it is stated repeatedly not only that it may err, but that it does err; often.
There is another question which arises here. How is conscience to be inspected ? - Who is to conduct the inspection ? Who
READING, writing, arithmetic, and geography are not, taught differently by a Methodist and by a Jesuit, but in precisely the same fashion, if they are taught properly. TO say that a " godless " instruction in these branches of knowledge, or in any others that are properly within the province' of the public schools, is " necessarily immoral," is to make a perfectly meaningless assertion.ÑNew York Times.
T E Christian religion made its way into the world in -'opposition to all human governments. Banishment, tortures, and death were inflicted in vain to stop its progress. But 'many of its professors, as soon as clothed with political power, lost the meek spirit which their creed inculcated, and began to inflict on other religions, and on dissenting sects of their own religion, persecutions more aggravated than those which their own apostles had endured.ÑCol. Richard M. Johnson.
¥¥
OF the SaturdaY, half-holiday and the
law, which makes it such, the says:Ñ
World
It is difficult to see what can be done, apart_ from persuasion, to secure a more general observance of the Saturday half-holiday. The law does not compel any merchant or Shop-keeper orLfacteri owner to Clime his establishment on Saturday afternoon. Indeed, there is no power in the Legislature to enact suck a law. All that can be doneby statute is done. The law makes Saturday 'afterneon a half-holiday. It compels banks to close, renders the protest of notes and the like illegal, and thus makes it less easy than it would otherWiee be for general business to 'go on. a Consegtience most large, and many small, establishments close at noon on Saturday

Errors: {'il': 1, 't': 2, 'systein': 1, "'go": 1, "'opposition": 1, 'doneby': 1, 'bierbower': 1, 'c': 1, 'eferson': 1, 'w': 1, 'feelingconsciencewhich': 1, 'ofthis': 1, "'s": 1, 'mc': 1, 'consegtience': 1, 'e': 1, 'lfacteri': 1, 'kee': 1, 'wiee': 1, 'saturda': 1, "'afterneon": 1, 'havethe': 1, "'many": 1, 'ametrican': 1, "'": 2, 'm': 1}
{'errors': {'il': 1, 't': 2, 'systein': 1, "'go": 1, "'opposition": 1, 'doneby': 1, 'bierbower': 1, 'c': 1, 'eferson': 1, 'w': 1, 'feelingconsciencewhich': 1, 'ofthis': 1, "'s": 1, 'mc': 1, 'consegtience': 1, 'e': 1, 'lfacteri': 1, 'kee': 1, 'wiee': 1, 'saturda': 1, "'afterneon": 1, 'havethe': 1, "'many": 1, 'ametrican': 1, "'": 2, 'm': 1}, 'doc_id': 'AmSn18910402-V06-14-page1.txt', 'num_errors': 28, 'num_tokens': 1080}
In [13]:
from collections import Counter
import csv

def process_title(title):
    print(title)
        
    statistics = process_texts(title)
    
    # Get summary statistics on the errors

    all_errors = [report['errors'] for report in statistics]

    inp = [dict(x) for x in all_errors]
    
    errors_summary = Counter()
    for y in inp:
        errors_summary += Counter(y)

    with open("{}{}-corpus-spelling-errors-round-4-{}.csv".format(out_dir, str(datetime.date.today()), title), 
              "w") as csv_file:
        fieldnames = ['doc_id', 'num_tokens', 'num_errors', 'errors']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        
        for stats in statistics:
            writer.writerow(stats)
    
    # Save error counts for each periodical title
    
    with open("{}{}-Spelling-Errors-{}.txt".format(out_dir, str(datetime.date.today()), title), "w") as outfile:
        fieldnames2 = ['spell_error', 'count']
        writer2 = csv.writer(outfile)
        writer2.writerow(fieldnames2)
        
        for key,value in errors_summary.items():
            writer2.writerow([key, value])  
In [30]:
%time process_title('ADV')
ADV
CPU times: user 1min 1s, sys: 3.34 s, total: 1min 4s
Wall time: 1min 8s
In [31]:
%time process_title('AmSn')
AmSn
CPU times: user 3min 53s, sys: 11 s, total: 4min 4s
Wall time: 4min 16s
In [32]:
%time process_title('ARAI')
ARAI
CPU times: user 2.67 s, sys: 160 ms, total: 2.83 s
Wall time: 2.96 s
In [36]:
%time process_title('CE')
CE
CPU times: user 1min 29s, sys: 4.5 s, total: 1min 33s
Wall time: 1min 42s
In [39]:
%time process_title('CUV')
CUV
CPU times: user 3min 8s, sys: 8.65 s, total: 3min 16s
Wall time: 3min 25s
In [14]:
spelling_dictionary = refresh_dictionary()
%time process_title('EDU')
EDU
CPU times: user 11.5 s, sys: 792 ms, total: 12.3 s
Wall time: 12.7 s
In [15]:
spelling_dictionary = refresh_dictionary()
%time process_title('GCB')
GCB
CPU times: user 2min 9s, sys: 8.25 s, total: 2min 17s
Wall time: 2min 25s
In [16]:
spelling_dictionary = refresh_dictionary()
%time process_title('GH')
GH
CPU times: user 54.4 s, sys: 2.72 s, total: 57.1 s
Wall time: 1min
In [17]:
spelling_dictionary = refresh_dictionary()
%time process_title('GOH')
GOH
CPU times: user 19.6 s, sys: 1.13 s, total: 20.7 s
Wall time: 21.4 s
In [18]:
spelling_dictionary = refresh_dictionary()
%time process_title('GS')
GS
CPU times: user 20.5 s, sys: 817 ms, total: 21.3 s
Wall time: 22.2 s
In [13]:
spelling_dictionary = refresh_dictionary()
%time process_title('HM')
HM
CPU times: user 1min 6s, sys: 4.65 s, total: 1min 11s
Wall time: 1min 15s
In [15]:
spelling_dictionary = refresh_dictionary()
%time process_title('HR')
HR
CPU times: user 11min 4s, sys: 27.4 s, total: 11min 31s
Wall time: 12min
In [14]:
spelling_dictionary = refresh_dictionary()
%time process_title('IR')
IR
CPU times: user 34.9 s, sys: 2.72 s, total: 37.7 s
Wall time: 42.2 s
In [14]:
spelling_dictionary = refresh_dictionary()
%time process_title('LB')
LB
CPU times: user 3min 38s, sys: 11.1 s, total: 3min 49s
Wall time: 4min 2s
In [15]:
spelling_dictionary = refresh_dictionary()
%time process_title('LH')
LH
CPU times: user 4min 3s, sys: 13.2 s, total: 4min 16s
Wall time: 4min 29s
In [16]:
spelling_dictionary = refresh_dictionary()
%time process_title('LibM')
LibM
CPU times: user 1min, sys: 4.07 s, total: 1min 4s
Wall time: 1min 6s
In [17]:
spelling_dictionary = refresh_dictionary()
%time process_title('LUH')
LUH
CPU times: user 2min 41s, sys: 7.76 s, total: 2min 49s
Wall time: 2min 58s
In [18]:
spelling_dictionary = refresh_dictionary()
%time process_title('NMN')
NMN
CPU times: user 7.88 s, sys: 407 ms, total: 8.28 s
Wall time: 8.59 s
In [19]:
spelling_dictionary = refresh_dictionary()
%time process_title('PHJ')
PHJ
CPU times: user 1min 43s, sys: 4.9 s, total: 1min 48s
Wall time: 1min 52s
In [14]:
spelling_dictionary = refresh_dictionary()
%time process_title('PTAR')
PTAR
CPU times: user 5.84 s, sys: 300 ms, total: 6.14 s
Wall time: 6.52 s
In [14]:
spelling_dictionary = refresh_dictionary()
%time process_title('PUR')
PUR
CPU times: user 3min 50s, sys: 21.2 s, total: 4min 11s
Wall time: 4min 23s
In [15]:
spelling_dictionary = refresh_dictionary()
%time process_title('RH')
RH
CPU times: user 1h 57min 26s, sys: 3min 20s, total: 2h 46s
Wall time: 2h 6min 19s
In [16]:
spelling_dictionary = refresh_dictionary()
%time process_title('Sligo')
Sligo
CPU times: user 27.1 s, sys: 2.71 s, total: 29.9 s
Wall time: 30.6 s
In [17]:
spelling_dictionary = refresh_dictionary()
%time process_title('SOL')
SOL
CPU times: user 60 s, sys: 5.57 s, total: 1min 5s
Wall time: 1min 9s
In [18]:
spelling_dictionary = refresh_dictionary()
%time process_title('ST')
ST
CPU times: user 18min 22s, sys: 1min 2s, total: 19min 24s
Wall time: 20min 2s
In [19]:
spelling_dictionary = refresh_dictionary()
%time process_title('SUW')
SUW
CPU times: user 2min 36s, sys: 15.1 s, total: 2min 51s
Wall time: 2min 55s
In [20]:
spelling_dictionary = refresh_dictionary()
%time process_title('TCOG')
TCOG
CPU times: user 47.8 s, sys: 4.57 s, total: 52.4 s
Wall time: 54 s
In [21]:
spelling_dictionary = refresh_dictionary()
%time process_title('TMM')
TMM
CPU times: user 53.4 s, sys: 5.41 s, total: 58.8 s
Wall time: 1min
In [22]:
spelling_dictionary = refresh_dictionary()
%time process_title('WMH')
WMH
CPU times: user 32.8 s, sys: 3.27 s, total: 36.1 s
Wall time: 38.3 s
In [23]:
spelling_dictionary = refresh_dictionary()
%time process_title('YI')
YI
CPU times: user 12min 26s, sys: 53.7 s, total: 13min 20s
Wall time: 13min 47s
In [24]:
# %load shared_elements/system_info.py
import IPython
print (IPython.sys_info())
!pip freeze
{'commit_hash': '5c9c918',
 'commit_source': 'installation',
 'default_encoding': 'UTF-8',
 'ipython_path': '/Users/jeriwieringa/miniconda3/envs/dissertation2/lib/python3.5/site-packages/IPython',
 'ipython_version': '5.1.0',
 'os_name': 'posix',
 'platform': 'Darwin-16.3.0-x86_64-i386-64bit',
 'sys_executable': '/Users/jeriwieringa/miniconda3/envs/dissertation2/bin/python',
 'sys_platform': 'darwin',
 'sys_version': '3.5.2 |Continuum Analytics, Inc.| (default, Jul  2 2016, '
                '17:52:12) \n'
                '[GCC 4.2.1 Compatible Apple LLVM 4.2 (clang-425.0.28)]'}
anaconda-client==1.5.5
appnope==0.1.0
argh==0.26.1
blinker==1.4
bokeh==0.12.3
boto==2.43.0
bz2file==0.98
chest==0.2.3
cloudpickle==0.2.1
clyent==1.2.2
dask==0.12.0
datashader==0.4.0
datashape==0.5.2
decorator==4.0.10
docutils==0.12
doit==0.29.0
gensim==0.12.4
Ghost.py==0.2.3
ghp-import2==1.0.1
gspread==0.4.1
HeapDict==1.0.0
httplib2==0.9.2
husl==4.0.3
ipykernel==4.5.2
ipython==5.1.0
ipython-genutils==0.1.0
ipywidgets==5.2.2
Jinja2==2.8
jsonschema==2.5.1
jupyter==1.0.0
jupyter-client==4.4.0
jupyter-console==5.0.0
jupyter-contrib-core==0.3.0
jupyter-contrib-nbextensions==0.2.2
jupyter-core==4.2.1
jupyter-highlight-selected-word==0.0.5
jupyter-latex-envs==1.3.5.4
jupyter-nbextensions-configurator==0.2.3
llvmlite==0.14.0
locket==0.2.0
Logbook==1.0.0
lxml==3.5.0
MacFSEvents==0.7
Mako==1.0.4
Markdown==2.6.7
MarkupSafe==0.23
mistune==0.7.3
multipledispatch==0.4.9
natsort==4.0.4
nb-anacondacloud==1.2.0
nb-conda==2.0.0
nb-conda-kernels==2.0.0
nb-config-manager==0.1.3
nbbrowserpdf==0.2.1
nbconvert==4.2.0
nbformat==4.2.0
nbpresent==3.0.2
networkx==1.11
Nikola==7.7.7
nltk==3.2.1
notebook==4.2.3
numba==0.29.0
numpy==1.11.2
oauth2client==4.0.0
odo==0.5.0
pandas==0.19.1
partd==0.3.6
path.py==0.0.0
pathtools==0.1.2
pexpect==4.0.1
pickleshare==0.7.4
Pillow==3.4.2
prompt-toolkit==1.0.9
psutil==4.3.0
ptyprocess==0.5.1
pyasn1==0.1.9
pyasn1-modules==0.0.8
pycrypto==2.6.1
Pygments==2.1.3
PyPDF2==1.25.1
PyRSS2Gen==1.1
pyshp==1.2.10
python-dateutil==2.6.0
pytz==2016.10
PyYAML==3.12
pyzmq==16.0.2
qtconsole==4.2.1
requests==2.12.3
rsa==3.4.2
scipy==0.18.1
simplegeneric==0.8.1
six==1.10.0
smart-open==1.3.5
terminado==0.6
textblob==0.11.1
toolz==0.8.1
tornado==4.4.2
traitlets==4.3.1
Unidecode==0.4.19
watchdog==0.8.3
wcwidth==0.1.7
webassets==0.11.1
widgetsnbextension==1.2.6
ws4py==0.3.4
xarray==0.8.2
Yapsy==1.11.223
In [ ]: