This notebook moves the generation of noun phrases out of the streaming process, because that step takes about 2 hours to run.

In [1]:
from textblob import TextBlob
import gensim
from gensim.parsing.preprocessing import STOPWORDS
import os
import sys
import re
import tarfile
import itertools
import logging
import nltk

In [2]:
# %load ../shared_elements/logging.py
# http://stackoverflow.com/questions/35936086/

# Message layout used for everything logged here: logger name, level, message.
formatter = logging.Formatter('%(name)s - %(levelname)s - %(message)s')

# STDERR handler. No level is set on the handler itself, so it emits
# every record the logger hands to it.
handler = logging.StreamHandler(sys.stderr)
handler.setFormatter(formatter)

# Configure the root logger: DEBUG and above, with the STDERR handler
# installed as its one and only handler.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
logger.handlers = [handler]

In [3]:
def process_page(page):
    """
    Preprocess a single periodical page, returning the result as
    a unicode string.

    The page is decoded as UTF-8 (unless it is already text) and stripped
    of leading and trailing whitespace. Then every character that is not
    an unaccented ASCII letter (a-z, A-Z) is replaced with a single space.
    Digits, punctuation and accented characters are therefore all removed;
    because each one becomes a space, the result has the same length as
    the stripped input.

    Rationale: there is a higher rate of messy OCR reporting accented
    characters than use in this corpus of languages other than English.
    This approach removes from study questions of non-English language
    use, but significantly reduces OCR noise.

    :param page: raw page content, as UTF-8 encoded bytes or as text.
    :return: the cleaned page text.
    :raises UnicodeDecodeError: if `page` is bytes that are not valid UTF-8.
    """
    # Text passes through untouched and anything else is decoded strictly
    # as UTF-8, which is what gensim.utils.to_unicode(page, 'utf8') does
    # on Python 3.
    if isinstance(page, str):
        content = page
    else:
        content = str(page, 'utf8')
    content = content.strip()

    # Stripping happens before the substitution, so non-letters at the
    # edges of the page still turn into leading/trailing spaces.
    return re.sub(r"[^a-zA-Z]", " ", content)

In [4]:
def iter_Periodicals(fname, log_every=None):
    """
    Yield the cleaned text of each periodical page, as a unicode string.

    The pages are read from the gzip-compressed tar archive `fname` on
    disk. Every regular file in the archive is treated as one page and is
    passed through `process_page` before being yielded; directories and
    other non-file members are skipped.

    :param fname: path to the `.tar.gz` archive holding the page files.
    :param log_every: if truthy, log the name of every `log_every`-th
        extracted file, starting with the first; if None or 0, log nothing.
    :return: a generator of cleaned page strings, in archive order.
    """
    extracted = 0
    with tarfile.open(fname, 'r:gz') as tf:
        for file_info in tf:
            if not file_info.isfile():
                continue
            if log_every and extracted % log_every == 0:
                logging.info("extracting file #%i: %s", extracted, file_info.name)
            # Close each member's file object as soon as it has been read,
            # instead of leaving one open per page until garbage collection.
            with tf.extractfile(file_info) as member:
                content = member.read()
            yield process_page(content)
            extracted += 1

In [5]:
def head(stream, n=10):
    """Convenience helper: collect at most the first `n` items of `stream` into a plain list."""
    leading_items = itertools.islice(stream, n)
    return list(leading_items)

def best_phrases(document_stream, top_n=2000, prune_at=100000, prune_every=1000):
    """Return a set of the `top_n` most common multi-word noun phrases.

    Noun phrases are extracted from each document with TextBlob and
    counted. A phrase is counted only if it contains a space (i.e. is
    multi-word) and every one of its words is purely alphabetic and
    longer than two characters.

    To bound memory use, the running counts are periodically cut back to
    the `prune_at` most frequent phrases. A phrase dropped by a prune
    starts again from zero if it reappears, so the final counts (and
    therefore the selection) may not be completely accurate.

    :param document_stream: iterable of document texts.
    :param top_n: number of most frequent phrases to return.
    :param prune_at: maximum number of phrases kept after each prune.
    :param prune_every: prune (and log progress) once every this many
        documents, starting with document #0; must be a non-zero integer.
    :return: a set of at most `top_n` phrase strings.
    """
    np_counts = {}
    for docno, doc in enumerate(document_stream):
        # prune out infrequent phrases from time to time, to save RAM.
        # the result may not be completely accurate because of this step
        if docno % prune_every == 0:
            sorted_phrases = sorted(np_counts.items(), key=lambda item: -item[1])
            np_counts = dict(sorted_phrases[:prune_at])
            logging.info("at document #%i, considering %i phrases: %s...",
                         docno, len(np_counts), head(sorted_phrases))

        # how many times have we seen each noun phrase?
        for phrase in TextBlob(doc).noun_phrases:
            # only consider multi-word phrases
            if u' ' not in phrase:
                continue
            # ignore phrases that contain too short/non-alphabetic words
            if all(word.isalpha() and len(word) > 2 for word in phrase.split()):
                np_counts[phrase] = np_counts.get(phrase, 0) + 1

    # Most frequent first; the returned set itself carries no ordering.
    sorted_phrases = sorted(np_counts, key=lambda phrase: -np_counts[phrase])
    return set(head(sorted_phrases, top_n))

In [6]:
def corpus_ne_phrases(corpus):
    """
    Collect the most common noun phrases from the page archive at `corpus`.

    Streams the pages with `iter_Periodicals`, selects phrases with
    `best_phrases` (using its default settings), logs the number selected
    together with a sample of up to ten of them, and returns the set.
    """
    logging.info("collecting entities from %s" % corpus)

    entities = best_phrases(iter_Periodicals(corpus))

    sample = list(entities)[:10]
    summary = "selected %i entities: %s..." % (len(entities), sample)
    logging.info(summary)

    return entities

In [7]:
# Path to the gzip-compressed tar archive of page texts; it is passed to
# corpus_ne_phrases in the next cell.
corpus = '/Users/jeriwieringa/Dissertation/text/text/2017-04-docs-for-whole-corpus-model.tar.gz'

In [8]:
# Run the full extraction over the archive. Progress is logged to STDERR
# at regular intervals while the documents are processed.
phrases = corpus_ne_phrases(corpus)

root - INFO - collecting entities from /Users/jeriwieringa/Dissertation/text/text/2017-04-docs-for-whole-corpus-model.tar.gz
root - INFO - at document #0, considering 0 phrases: []...
root - INFO - at document #1000, considering 21082 phrases: [('the advocate', 549), ('christian education', 365), ('church school', 264), ('church schools', 234), ('public schools', 141), ('young people', 138), ('school work', 130), ('young men', 124), ('educational work', 112), ('christian schools', 110)]...
root - INFO - at document #2000, considering 49991 phrases: [('the advocate', 907), ('christian education', 518), ('public schools', 423), ('church school', 334), ('church schools', 268), ('civil government', 257), ('young people', 246), ('christian religion', 243), ('american sentinel', 224), ('school work', 222)]...
root - INFO - at document #3000, considering 80276 phrases: [('public schools', 1193), ('the advocate', 908), ('civil government', 855), ('christian religion', 560), ('christian educati

In [9]:
# Display the selected phrases (the notebook echoes the value of a cell's
# final expression).
phrases

{'creek michigan',
 'large increase',
 'precious light',
 'active interest',
 'good home',
 'strong drink',
 'universal postal',
 'whole earth',
 'open doors',
 'perfect obedience',
 'american people',
 'god rev',
 'good effect',
 'medical school',
 'dead man',
 'christ jesus',
 'good number',
 'past summer',
 'whole day',
 'instructor vol',
 'class work',
 'city work',
 'general government',
 'good way',
 'twentieth century',
 'eld haskell',
 'honest souls',
 'mighty work',
 'definite time',
 'present condition',
 'fiery furnace',
 'modern science',
 'different times',
 'camp ground',
 'thy kingdom',
 'possible way',
 'creek mich',
 'way home',
 'certain extent',
 'school officers',
 'various phases',
 'god cor',
 'roman catholic church',
 'new man',
 'christian work',
 'self exaltation',
 'manual training',
 'good words',
 'sabbath july',
 'prominent part',
 'advent review and sabbath herald vol',
 'field secretary',
 'animal life',
 'great majority',
 'cold bath',
 'red sea',
 'heal

In [11]:
# Write the selected phrases to disk, one per line.
# `phrases` is a set, so its iteration order can differ between runs;
# sorting makes the output file reproducible. The `with` block closes the
# file on exit, so no explicit close() is needed.
with open("/Users/jeriwieringa/Dissertation/drafts/data/module-3/2017-04-noun-phrases-2000.txt", "w", encoding="utf-8") as o:
    for phrase in sorted(phrases):
        o.write("{}\n".format(phrase))

In [12]:
# %load ../shared_elements/system_info.py
# Record the environment this notebook was run in: IPython's own system
# report is printed here, and the `!pip freeze` shell escape that follows
# lists the installed package versions.
import IPython
print (IPython.sys_info())
!pip freeze

{'commit_hash': '5c9c918',
 'commit_source': 'installation',
 'default_encoding': 'UTF-8',
 'ipython_path': '/Users/jeriwieringa/miniconda3/envs/dissertation2/lib/python3.5/site-packages/IPython',
 'ipython_version': '5.1.0',
 'os_name': 'posix',
 'platform': 'Darwin-16.5.0-x86_64-i386-64bit',
 'sys_executable': '/Users/jeriwieringa/miniconda3/envs/dissertation2/bin/python',
 'sys_platform': 'darwin',
 'sys_version': '3.5.2 |Continuum Analytics, Inc.| (default, Jul  2 2016, '
                '17:52:12) \n'
                '[GCC 4.2.1 Compatible Apple LLVM 4.2 (clang-425.0.28)]'}
alabaster==0.7.10
anaconda-client==1.5.5
appdirs==1.4.3
appnope==0.1.0
argh==0.26.1
Babel==2.3.4
beautifulsoup4==4.5.3
blinker==1.4
bokeh==0.12.4
boto==2.43.0
brewer2mpl==1.4.1
bz2file==0.98
chest==0.2.3
cleanOCR==0.1
cloudpickle==0.2.2
clyent==1.2.2
cycler==0.10.0
dask==0.12.0
datashader==0.4.0
datashape==0.5.2
decorator==4.0.11
docutils==0.13.1
doit==0.30.3
gensim==0.12.4
geoplotlib==0.3.2
ggplot==0.11.5
Ghos