create-SDA-people-and-places-lists

In [1]:
import gspread
import pandas as pd
from oauth2client.service_account import ServiceAccountCredentials
import datetime
import numpy as np
In [2]:
scope = ['https://spreadsheets.google.com/feeds']
In [3]:
credentials = ServiceAccountCredentials.from_json_keyfile_name('secrets/dissertation.json', scope)
In [4]:
gc = gspread.authorize(credentials)
In [5]:
sheets = ['YB1883', 'YB1885', 'YB1890', 'YB1894', 
          'YB1905', 'YB1910', 'YB1915', 'YB1920']
In [6]:
def read_in_sheet(sheet):
    dts = gc.open(sheet).sheet1
    frame = pd.DataFrame(dts.get_all_records())
    return(frame)
In [7]:
df = pd.DataFrame()
In [8]:
for sheet in sheets:
    frame = read_in_sheet(sheet)
    df = df.append(frame, ignore_index=True)
In [9]:
df
Out[9]:
conference gender group institution-name lastname location name organization page position position-information prefix region suffix yearbook-year
0 General m board of trustees Educational Society Butler Battle Creek, MI Geo. I. 7 trustee Michigan 1883
1 General m board of trustees Educational Society Haskell South Lancaster, MA S.N. 7 trustee Michigan 1883
2 General m board of trustees Educational Society Henry Battle Creek, MI A.R. 7 trustee Michigan 1883
3 General m board of trustees Educational Society Kellogg Battle Creek, MI J.H. 7 trustee Michigan M.D. 1883
4 General m board of trustees Educational Society Kellogg Battle Creek, MI H.W. 7 trustee Michigan 1883
5 General m board of trustees Educational Society Oyen Battle Creek, MI A.B. 7 trustee Michigan 1883
6 General m board of trustees Educational Society Sisley Battle Creek, MI W.C. 7 trustee Michigan 1883
7 General m board of directors Health Reform Institute Fargo Greenville, MI J. 7 director Michigan 1883
8 General m board of directors Health Reform Institute Hall Battle Creek, MI W.H. 7 director Michigan 1883
9 General m board of directors Health Reform Institute Hall Battle Creek, MI L.M. 7 director Michigan 1883
10 General m board of directors Health Reform Institute Haskell South Lancaster, MA S.N. 7 director Michigan 1883
11 General m board of directors Health Reform Institute Henry Battle Creek, MI A.R. 7 director Michigan 1883
12 General m board of directors Health Reform Institute Kellogg Battle Creek, MI J.H. 7 director Michigan M.D. 1883
13 General m board of directors Health Reform Institute Murphy Battle Creek, MI G.H. 7 director Michigan 1883
14 General m Pacific Seventh-day Adventist Publishing Assoc... Glenn Oakland, CA W.N. 7 auditor 1883
15 General m director Pacific Seventh-day Adventist Publishing Assoc... Haskell South Lancaster, MA S.N. 7 director 1883
16 General m director Pacific Seventh-day Adventist Publishing Assoc... Israel Oakland, CA M.C. 7 director 1883
17 General m director Pacific Seventh-day Adventist Publishing Assoc... Jones Oakland, CA C.H. 7 director 1883
18 General m director Pacific Seventh-day Adventist Publishing Assoc... Morrison Healdsburg, CA J. 7 director 1883
19 General m director Pacific Seventh-day Adventist Publishing Assoc... White Oakland, CA W.C. 7 director 1883
20 General m publishing committee Pacific Seventh-day Adventist Publishing Assoc... Haskell South Lancaster, MA S.N. 7 committee member 1883
21 General m publishing committee Pacific Seventh-day Adventist Publishing Assoc... Waggoner Oakland, CA J.H. 7 committee member 1883
22 General m publishing committee Pacific Seventh-day Adventist Publishing Assoc... White Oakland, CA W.C. 7 committee member 1883
23 General m Pacific Seventh-day Adventist Publishing Assoc... Haskell South Lancaster, MA S.N. 7 president 1883
24 General m Pacific Seventh-day Adventist Publishing Assoc... Vickery Oakland, CA W.K. 7 secretary 1883
25 General m Pacific Seventh-day Adventist Publishing Assoc... White Oakland, CA W.C. 7 vice president 1883
26 General m Seventh-day Adventist Publishing Association Sisley Battle Creek, MI W.C. 6 auditor Michigan 1883
27 General m publishing committee Seventh-day Adventist Publishing Association Amadon Battle Creek, MI G.W. 6 committee member Michigan 1883
28 General m publishing committee Seventh-day Adventist Publishing Association Butler Battle Creek, MI Geo. I. 6 committee member Michigan 1883
29 General m publishing committee Seventh-day Adventist Publishing Association Smith Battle Creek, MI U. 6 committee member Michigan 1883
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
15053 board of trustees Loma Linda Sanitarium Martin W.F. 265, 227 trustee California 1920
15054 board of trustees Loma Linda Sanitarium Reaser G.W. 265, 227 trustee California 1920
15055 board of trustees Loma Linda Sanitarium Risley E.H. 265, 227 trustee California 1920
15056 board of trustees Loma Linda Sanitarium Bowen L.M. 265, 227 trustee California 1920
15057 board of trustees Loma Linda Sanitarium McElhany J.L. 265, 227 trustee California 1920
15058 board of trustees Loma Linda Sanitarium Evans Newton 265, 227 trustee California 1920
15059 board of trustees Loma Linda Sanitarium Vollmer H.W. 265, 227 trustee California 1920
15060 board of trustees Loma Linda Sanitarium Magan P.T. 265, 227 trustee California 1920
15061 board of trustees Loma Linda Sanitarium Weir J.J. 265, 227 trustee California 1920
15062 board of trustees Loma Linda Sanitarium Christian J.W. 265, 227 trustee California 1920
15063 officers of the board Loma Linda Sanitarium Evans Newton 265, 227 vice president California 1920
15064 officers St. Helena Sanitarium Rice Helen N. California Medical Missionary and Benevolent A... 267-268 lady head nurse Miss California R.N. 1920
15065 officers St. Helena Sanitarium Rice Helen N. California Medical Missionary and Benevolent A... 267-268 secretary training-school Miss California R.N. 1920
15066 Columbia Union executive committee Booth A.S. 39, 40 committee member 1920
15067 Columbia Union executive committee Harter R.E. 39, 41 committee member 1920
15068 Columbia Union executive committee Parsons D.A. 39, 42 committee member 1920
15069 Columbia Union executive committee Moffet W.C. 39, 44 committee member 1920
15070 Columbia Union executive committee Leland J.A. 39, 47 committee member 1920
15071 Columbia Union executive committee Lindsay R.S. 39, 48 committee member 1920
15072 Columbia Union executive committee Westbrook T.B. 39, 49 committee member 1920
15073 Columbia Union executive committee Shaw Fred 48-49 committee member West Pennsylvania 1920
15074 Lake Union executive committee Clark A.J. 54, 55 committee member 1920
15075 Lake Union executive committee Westworth Wm. A. 54, 57 committee member 1920
15076 Lake Union executive committee Holden W.H. 54, 59 committee member 1920
15077 Lake Union executive committee Wiest C.S. 54, 60 committee member 1920
15078 Lake Union executive committee Nethery R.J. 54, 62 committee member 1920
15079 Lake Union executive committee Irwin J.J. 54, 63 committee member 1920
15080 Lake Union executive committee Bristol E.A. 54, 64 committee member 1920
15081 Lake Union executive committee Piper J.F. 54, 66 committee member 1920
15082 Pacific Union executive committee Adams W.M. 95-96 committee member Southern California 1920

15083 rows × 15 columns

In [10]:
last_names = df['lastname']
In [11]:
last_names_df = last_names.str.rsplit(' ', expand=True)
In [12]:
last_names_df
Out[12]:
0 1 2 3
0 Butler None None None
1 Haskell None None None
2 Henry None None None
3 Kellogg None None None
4 Kellogg None None None
5 Oyen None None None
6 Sisley None None None
7 Fargo None None None
8 Hall None None None
9 Hall None None None
10 Haskell None None None
11 Henry None None None
12 Kellogg None None None
13 Murphy None None None
14 Glenn None None None
15 Haskell None None None
16 Israel None None None
17 Jones None None None
18 Morrison None None None
19 White None None None
20 Haskell None None None
21 Waggoner None None None
22 White None None None
23 Haskell None None None
24 Vickery None None None
25 White None None None
26 Sisley None None None
27 Amadon None None None
28 Butler None None None
29 Smith None None None
... ... ... ... ...
15053 Martin None None None
15054 Reaser None None None
15055 Risley None None None
15056 Bowen None None None
15057 McElhany None None None
15058 Evans None None None
15059 Vollmer None None None
15060 Magan None None None
15061 Weir None None None
15062 Christian None None None
15063 Evans None None None
15064 Rice None None None
15065 Rice None None None
15066 Booth None None None
15067 Harter None None None
15068 Parsons None None None
15069 Moffet None None None
15070 Leland None None None
15071 Lindsay None None None
15072 Westbrook None None None
15073 Shaw None None None
15074 Clark None None None
15075 Westworth None None None
15076 Holden None None None
15077 Wiest None None None
15078 Nethery None None None
15079 Irwin None None None
15080 Bristol None None None
15081 Piper None None None
15082 Adams None None None

15083 rows × 4 columns

In [13]:
def df_to_list(df):
    headers = list(df.columns.values)
    list_out = []
    for header in headers:
        df[header].str.strip()
        col = df[header].tolist()
        for each in col:
            if str(each).isalpha() and len(str(each)) > 2:
                list_out.append(each)
            else:
                pass
    return(list_out)       
In [14]:
last_names_split = df_to_list(last_names_df)
In [15]:
out_dir = '/Users/jeriwieringa/Dissertation/drafts/data/word-lists/'
In [16]:
with open("{}{}-SDA-last-names.txt".format(out_dir,str(datetime.date.today())), "w") as f:
    for name in last_names_split:
        f.write("{}\n".format(name))
In [17]:
df2 = df
In [18]:
def drop_empty_rows(df, header):
    df[header].replace('', np.nan, inplace=True)
    df.dropna(subset=[header], inplace=True)
    return(df)
In [19]:
df2 = drop_empty_rows(df2, 'location')
In [20]:
places = df2['location']
In [21]:
places = places.str.replace(' ', ',')
In [22]:
df3 = places.str.split(',', expand=True)
In [23]:
df3
Out[23]:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
0 Battle Creek MI None None None None None None None None None None None None
1 South Lancaster MA None None None None None None None None None None None None
2 Battle Creek MI None None None None None None None None None None None None
3 Battle Creek MI None None None None None None None None None None None None
4 Battle Creek MI None None None None None None None None None None None None
5 Battle Creek MI None None None None None None None None None None None None
6 Battle Creek MI None None None None None None None None None None None None
7 Greenville MI None None None None None None None None None None None None None
8 Battle Creek MI None None None None None None None None None None None None
9 Battle Creek MI None None None None None None None None None None None None
10 South Lancaster MA None None None None None None None None None None None None
11 Battle Creek MI None None None None None None None None None None None None
12 Battle Creek MI None None None None None None None None None None None None
13 Battle Creek MI None None None None None None None None None None None None
14 Oakland CA None None None None None None None None None None None None None
15 South Lancaster MA None None None None None None None None None None None None
16 Oakland CA None None None None None None None None None None None None None
17 Oakland CA None None None None None None None None None None None None None
18 Healdsburg CA None None None None None None None None None None None None None
19 Oakland CA None None None None None None None None None None None None None
20 South Lancaster MA None None None None None None None None None None None None
21 Oakland CA None None None None None None None None None None None None None
22 Oakland CA None None None None None None None None None None None None None
23 South Lancaster MA None None None None None None None None None None None None
24 Oakland CA None None None None None None None None None None None None None
25 Oakland CA None None None None None None None None None None None None None
26 Battle Creek MI None None None None None None None None None None None None
27 Battle Creek MI None None None None None None None None None None None None
28 Battle Creek MI None None None None None None None None None None None None
29 Battle Creek MI None None None None None None None None None None None None
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
13697 1302 Harding St. Nashville TN None None None None None None None None None
13698 1051 Mississippi Blvd. Memphis TN None None None None None None None None None
13699 736 Saffrans St. Memphis TN None None None None None None None None None
13700 Ashland City TN None None None None None None None None None None None None
13701 901 Buchanan St. Nashville TN None None None None None None None None None
13702 Leach TN None None None None None None None None None None None None None
13703 Route 1 Burns TN None None None None None None None None None None
13704 Route 3 Goodlettsville TN None None None None None None None None None None
13705 2310 Siefried St. Nashville TN None None None None None None None None None
13706 2119 Twenty-fourth Ave. North Nashville TN None None None None None None None
13712 Bon Aqua TN None None None None None None None None None None None None
13713 Jackson TN None None None None None None None None None None None None None
13714 Nashville TN None None None None None None None None None None None None None
13715 994 Mississippi Blvd. Memphis TN None None None None None None None None None
13716 322 Jackson Bldg. Nashville TN None None None None None None None None None
13717 322 Jackson Bldg. Nashville TN None None None None None None None None None
13718 322 Jackson Bldg. Nashville TN None None None None None None None None None
13719 2111 Evelyn Ave. Memphis TN None None None None None None None None None
13720 1900 Twenty-Fourth Ave. North Nashville TN None None None None None None None
13721 666 North Dunlap St. Memphis TN None None None None None None None None
13722 2315 Seifried St. Nashville TN None None None None None None None None None
13723 322 Jackson Bldg. Nashville TN None None None None None None None None None
13724 322 Jackson Bldg. Nashville TN None None None None None None None None None
13725 Twenty-fourth and New Bridge Road North Nashville TN None None None None None
13726 Paducah KY None None None None None None None None None None None None None
15017 Jamaica None None None None None None None None None None None None None None None
15019 Box 146 Glendale CA None None None None None None None None None None
15020 College View NE None None None None None None None None None None None None
15021 2718 Third Ave. South Minneapolis MN None None None None None None None
15022 College Place WA None None None None None None None None None None None None

6467 rows × 16 columns

In [24]:
places_split = df_to_list(df3)
In [25]:
list(set(places_split))[:50]
Out[25]:
['Gray',
 'Oroville',
 'Shreveport',
 'Sāo',
 'Alto',
 'Central',
 'Castlereagh',
 'Warrenton',
 'Alexandria',
 'Cannelton',
 'Hydro',
 'National',
 'Ladysmith',
 'Sylvan',
 'Vaughn',
 'Fish',
 'Watts',
 'Elgin',
 'Cheng',
 'Winslow',
 'Junction',
 'Zone',
 'Logan',
 'Edenville',
 'Watertown',
 'Lincoln',
 'Kilmarnock',
 'Whalan',
 'Corydon',
 'Danville',
 'Newark',
 'Fox',
 'Omer',
 'Springboro',
 'British',
 'Copenhagan',
 'Ave',
 'Walnut',
 'Trade',
 'Beach',
 'Asbury',
 'Fancher',
 'Rileyville',
 'Muscoda',
 'Moorestown',
 'Whipple',
 'Vaud',
 'Roaring',
 'Antonio',
 'Royal']
In [26]:
len(places_split)
Out[26]:
80237
In [27]:
with open("{}{}-SDA-place-names.txt".format(out_dir,str(datetime.date.today())), "w") as f:
    for place in list(set(places_split)):
        f.write("{}\n".format(place))
In [28]:
# %load shared_elements/system_info.py
import IPython
print (IPython.sys_info())
!pip freeze
{'commit_hash': '5c9c918',
 'commit_source': 'installation',
 'default_encoding': 'UTF-8',
 'ipython_path': '/Users/jeriwieringa/miniconda3/envs/dissertation2/lib/python3.5/site-packages/IPython',
 'ipython_version': '5.1.0',
 'os_name': 'posix',
 'platform': 'Darwin-16.1.0-x86_64-i386-64bit',
 'sys_executable': '/Users/jeriwieringa/miniconda3/envs/dissertation2/bin/python',
 'sys_platform': 'darwin',
 'sys_version': '3.5.2 |Continuum Analytics, Inc.| (default, Jul  2 2016, '
                '17:52:12) \n'
                '[GCC 4.2.1 Compatible Apple LLVM 4.2 (clang-425.0.28)]'}
anaconda-client==1.5.5
appnope==0.1.0
argh==0.26.1
blinker==1.4
bokeh==0.12.3
boto==2.43.0
bz2file==0.98
chest==0.2.3
cloudpickle==0.2.1
clyent==1.2.2
dask==0.12.0
datashader==0.4.0
datashape==0.5.2
decorator==4.0.10
docutils==0.12
doit==0.29.0
gensim==0.12.4
Ghost.py==0.2.3
ghp-import2==1.0.1
gspread==0.4.1
HeapDict==1.0.0
httplib2==0.9.2
husl==4.0.3
ipykernel==4.5.2
ipython==5.1.0
ipython-genutils==0.1.0
ipywidgets==5.2.2
Jinja2==2.8
jsonschema==2.5.1
jupyter==1.0.0
jupyter-client==4.4.0
jupyter-console==5.0.0
jupyter-core==4.2.1
llvmlite==0.14.0
locket==0.2.0
Logbook==1.0.0
lxml==3.5.0
MacFSEvents==0.7
Mako==1.0.4
Markdown==2.6.7
MarkupSafe==0.23
mistune==0.7.3
multipledispatch==0.4.9
natsort==4.0.4
nb-anacondacloud==1.2.0
nb-conda==2.0.0
nb-conda-kernels==2.0.0
nb-config-manager==0.1.3
nbbrowserpdf==0.2.1
nbconvert==4.2.0
nbformat==4.2.0
nbpresent==3.0.2
networkx==1.11
Nikola==7.7.7
nltk==3.2.1
notebook==4.2.3
numba==0.29.0
numpy==1.11.2
oauth2client==4.0.0
odo==0.5.0
pandas==0.19.1
partd==0.3.6
path.py==0.0.0
pathtools==0.1.2
pexpect==4.0.1
pickleshare==0.7.4
Pillow==3.4.2
prompt-toolkit==1.0.9
ptyprocess==0.5.1
pyasn1==0.1.9
pyasn1-modules==0.0.8
pycrypto==2.6.1
Pygments==2.1.3
PyPDF2==1.25.1
PyRSS2Gen==1.1
python-dateutil==2.6.0
pytz==2016.10
PyYAML==3.12
pyzmq==16.0.2
qtconsole==4.2.1
requests==2.12.3
rsa==3.4.2
scipy==0.18.1
simplegeneric==0.8.1
six==1.10.0
smart-open==1.3.5
terminado==0.6
textblob==0.11.1
toolz==0.8.1
tornado==4.4.2
traitlets==4.3.1
Unidecode==0.4.19
watchdog==0.8.3
wcwidth==0.1.7
webassets==0.11.1
widgetsnbextension==1.2.6
ws4py==0.3.4
xarray==0.8.2
Yapsy==1.11.223
In [ ]: