create-scriptural-word-list

In [1]:
import csv
from nltk import word_tokenize
import os.path
import re
In [2]:
_dir = "/Users/jeriwieringa/Dissertation/drafts/data/"
In [3]:
"""The source of the kjv file is the Christian Classics Ethereal Library (https://www.ccel.org/ccel/bible/kjv.txt). 
I removed the index and URL links from the end of the provided file before processing.
"""
file = "kjv.txt"
In [4]:
def process_text(text):
    text_cleaned = re.sub(r"[^a-zA-Z]", " ", text)
    tokens = word_tokenize(text_cleaned)
    tokens_lower = [w.lower() for w in tokens]
    return(list(set(tokens_lower)))
In [5]:
with open(os.path.join(_dir, file), "r") as f:
    print(file)
    content = f.read()
    unique_words = process_text(content)
    print(unique_words[2])
    print(type(unique_words))
    with open(os.path.join(_dir, "word-lists", "2017-05-24-kjv-wordlist.txt"), "w") as outfile:
        for each in unique_words:
            outfile.write("{}\n".format(each))
kjv.txt
biatas
<class 'list'>
In [6]:
# %load "shared_elements/system_info.py"
import IPython
print (IPython.sys_info())
!pip freeze
{'commit_hash': '51ce9d73b',
 'commit_source': 'installation',
 'default_encoding': 'UTF-8',
 'ipython_path': '/Users/jeriwieringa/miniconda3/envs/GoH/lib/python3.5/site-packages/IPython',
 'ipython_version': '6.0.0',
 'os_name': 'posix',
 'platform': 'Darwin-16.6.0-x86_64-i386-64bit',
 'sys_executable': '/Users/jeriwieringa/miniconda3/envs/GoH/bin/python',
 'sys_platform': 'darwin',
 'sys_version': '3.5.3 |Continuum Analytics, Inc.| (default, Mar  6 2017, '
                '12:15:08) \n'
                '[GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.57)]'}
anaconda-client==1.6.2
appnope==0.1.0
beautifulsoup4==4.5.3
bleach==1.5.0
bokeh==0.12.5
clyent==1.2.2
decorator==4.0.11
entrypoints==0.2.2
Ghost.py==0.2.3
html5lib==0.999
ipykernel==4.6.1
ipython==6.0.0
ipython-genutils==0.2.0
ipywidgets==6.0.0
Jinja2==2.9.6
jsonschema==2.6.0
jupyter==1.0.0
jupyter-client==5.0.1
jupyter-console==5.1.0
jupyter-contrib-core==0.3.1
jupyter-contrib-nbextensions==0.2.7
jupyter-core==4.3.0
jupyter-highlight-selected-word==0.0.11
jupyter-latex-envs==1.3.8.2
jupyter-nbextensions-configurator==0.2.4
lxml==3.7.3
MarkupSafe==0.23
mistune==0.7.4
nb-anacondacloud==1.2.0
nb-conda==2.0.0
nb-conda-kernels==2.0.0
nb-config-manager==0.1.3
nbbrowserpdf==0.2.1
nbconvert==5.1.1
nbformat==4.3.0
nbpresent==3.0.2
nltk==3.2.2
notebook==5.0.0
numpy==1.12.1
pandas==0.19.2
pandocfilters==1.4.1
pexpect==4.2.1
pickleshare==0.7.4
prompt-toolkit==1.0.14
psutil==5.2.1
ptyprocess==0.5.1
Pygments==2.2.0
PyPDF2==1.26.0
python-dateutil==2.6.0
pytz==2017.2
PyYAML==3.12
pyzmq==16.0.2
qtconsole==4.3.0
requests==2.13.0
simplegeneric==0.8.1
six==1.10.0
terminado==0.6
testpath==0.3
tornado==4.4.2
traitlets==4.3.2
wcwidth==0.1.7
wget==2.2
widgetsnbextension==2.0.0
In [ ]: