create-scriptural-word-list
In [1]:
import csv
from nltk import word_tokenize
import os.path
import re
In [2]:
# Base directory that holds the source text and the "word-lists" output folder.
_dir = "/Users/jeriwieringa/Dissertation/drafts/data/"
In [3]:
"""The source of the kjv file is the Christian Classics Ethereal Library (https://www.ccel.org/ccel/bible/kjv.txt).
I removed the index and URL links from the end of the provided file before processing.
"""
# Name of the KJV source text file; it is joined onto `_dir` when opened.
file = "kjv.txt"
In [4]:
def process_text(text):
    """Return the unique lowercased alphabetic tokens found in ``text``.

    Every character that is not an ASCII letter is replaced with a space,
    the result is tokenized with NLTK's ``word_tokenize``, and each token is
    lowercased before duplicates are dropped.

    Args:
        text: The raw text to build a vocabulary from.

    Returns:
        A list of the unique lowercased tokens, in sorted order.
    """
    text_cleaned = re.sub(r"[^a-zA-Z]", " ", text)
    tokens = word_tokenize(text_cleaned)
    # Sort rather than return bare set order: string hashing is randomized
    # per process, so unsorted output would differ from one run to the next.
    return sorted({w.lower() for w in tokens})
In [5]:
# Read the whole source text; the handle is released as soon as it is loaded.
source_path = os.path.join(_dir, file)
with open(source_path, "r") as source:
    print(file)
    content = source.read()

# Reduce the text to its unique vocabulary and show a quick sanity check.
unique_words = process_text(content)
print(unique_words[2])
print(type(unique_words))

# Write the vocabulary out, one word per line.
wordlist_path = os.path.join(_dir, "word-lists", "2017-05-24-kjv-wordlist.txt")
with open(wordlist_path, "w") as outfile:
    outfile.writelines("{}\n".format(word) for word in unique_words)
In [6]:
# %load "shared_elements/system_info.py"
# Record the environment this notebook ran in: IPython's system information
# followed by the list of installed packages.
import IPython
print (IPython.sys_info())
# The leading `!` is IPython shell-escape syntax, so this line only runs
# inside IPython/Jupyter, not under the plain Python interpreter.
!pip freeze
In [ ]: