create-SDA-people-and-places-lists
In [1]:
import gspread
import pandas as pd
from oauth2client.service_account import ServiceAccountCredentials
import datetime
import numpy as np
In [2]:
scope = ['https://spreadsheets.google.com/feeds']
In [3]:
credentials = ServiceAccountCredentials.from_json_keyfile_name('secrets/dissertation.json', scope)
In [4]:
gc = gspread.authorize(credentials)
In [5]:
sheets = ['YB1883', 'YB1885', 'YB1890', 'YB1894',
'YB1905', 'YB1910', 'YB1915', 'YB1920']
In [6]:
def read_in_sheet(sheet):
dts = gc.open(sheet).sheet1
frame = pd.DataFrame(dts.get_all_records())
return(frame)
In [7]:
df = pd.DataFrame()
In [8]:
for sheet in sheets:
frame = read_in_sheet(sheet)
df = df.append(frame, ignore_index=True)
In [9]:
df
Out[9]:
conference | gender | group | institution-name | lastname | location | name | organization | page | position | position-information | prefix | region | suffix | yearbook-year | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | General | m | board of trustees | Educational Society | Butler | Battle Creek, MI | Geo. I. | 7 | trustee | Michigan | 1883 | ||||
1 | General | m | board of trustees | Educational Society | Haskell | South Lancaster, MA | S.N. | 7 | trustee | Michigan | 1883 | ||||
2 | General | m | board of trustees | Educational Society | Henry | Battle Creek, MI | A.R. | 7 | trustee | Michigan | 1883 | ||||
3 | General | m | board of trustees | Educational Society | Kellogg | Battle Creek, MI | J.H. | 7 | trustee | Michigan | M.D. | 1883 | |||
4 | General | m | board of trustees | Educational Society | Kellogg | Battle Creek, MI | H.W. | 7 | trustee | Michigan | 1883 | ||||
5 | General | m | board of trustees | Educational Society | Oyen | Battle Creek, MI | A.B. | 7 | trustee | Michigan | 1883 | ||||
6 | General | m | board of trustees | Educational Society | Sisley | Battle Creek, MI | W.C. | 7 | trustee | Michigan | 1883 | ||||
7 | General | m | board of directors | Health Reform Institute | Fargo | Greenville, MI | J. | 7 | director | Michigan | 1883 | ||||
8 | General | m | board of directors | Health Reform Institute | Hall | Battle Creek, MI | W.H. | 7 | director | Michigan | 1883 | ||||
9 | General | m | board of directors | Health Reform Institute | Hall | Battle Creek, MI | L.M. | 7 | director | Michigan | 1883 | ||||
10 | General | m | board of directors | Health Reform Institute | Haskell | South Lancaster, MA | S.N. | 7 | director | Michigan | 1883 | ||||
11 | General | m | board of directors | Health Reform Institute | Henry | Battle Creek, MI | A.R. | 7 | director | Michigan | 1883 | ||||
12 | General | m | board of directors | Health Reform Institute | Kellogg | Battle Creek, MI | J.H. | 7 | director | Michigan | M.D. | 1883 | |||
13 | General | m | board of directors | Health Reform Institute | Murphy | Battle Creek, MI | G.H. | 7 | director | Michigan | 1883 | ||||
14 | General | m | Pacific Seventh-day Adventist Publishing Assoc... | Glenn | Oakland, CA | W.N. | 7 | auditor | 1883 | ||||||
15 | General | m | director | Pacific Seventh-day Adventist Publishing Assoc... | Haskell | South Lancaster, MA | S.N. | 7 | director | 1883 | |||||
16 | General | m | director | Pacific Seventh-day Adventist Publishing Assoc... | Israel | Oakland, CA | M.C. | 7 | director | 1883 | |||||
17 | General | m | director | Pacific Seventh-day Adventist Publishing Assoc... | Jones | Oakland, CA | C.H. | 7 | director | 1883 | |||||
18 | General | m | director | Pacific Seventh-day Adventist Publishing Assoc... | Morrison | Healdsburg, CA | J. | 7 | director | 1883 | |||||
19 | General | m | director | Pacific Seventh-day Adventist Publishing Assoc... | White | Oakland, CA | W.C. | 7 | director | 1883 | |||||
20 | General | m | publishing committee | Pacific Seventh-day Adventist Publishing Assoc... | Haskell | South Lancaster, MA | S.N. | 7 | committee member | 1883 | |||||
21 | General | m | publishing committee | Pacific Seventh-day Adventist Publishing Assoc... | Waggoner | Oakland, CA | J.H. | 7 | committee member | 1883 | |||||
22 | General | m | publishing committee | Pacific Seventh-day Adventist Publishing Assoc... | White | Oakland, CA | W.C. | 7 | committee member | 1883 | |||||
23 | General | m | Pacific Seventh-day Adventist Publishing Assoc... | Haskell | South Lancaster, MA | S.N. | 7 | president | 1883 | ||||||
24 | General | m | Pacific Seventh-day Adventist Publishing Assoc... | Vickery | Oakland, CA | W.K. | 7 | secretary | 1883 | ||||||
25 | General | m | Pacific Seventh-day Adventist Publishing Assoc... | White | Oakland, CA | W.C. | 7 | vice president | 1883 | ||||||
26 | General | m | Seventh-day Adventist Publishing Association | Sisley | Battle Creek, MI | W.C. | 6 | auditor | Michigan | 1883 | |||||
27 | General | m | publishing committee | Seventh-day Adventist Publishing Association | Amadon | Battle Creek, MI | G.W. | 6 | committee member | Michigan | 1883 | ||||
28 | General | m | publishing committee | Seventh-day Adventist Publishing Association | Butler | Battle Creek, MI | Geo. I. | 6 | committee member | Michigan | 1883 | ||||
29 | General | m | publishing committee | Seventh-day Adventist Publishing Association | Smith | Battle Creek, MI | U. | 6 | committee member | Michigan | 1883 | ||||
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
15053 | board of trustees | Loma Linda Sanitarium | Martin | W.F. | 265, 227 | trustee | California | 1920 | |||||||
15054 | board of trustees | Loma Linda Sanitarium | Reaser | G.W. | 265, 227 | trustee | California | 1920 | |||||||
15055 | board of trustees | Loma Linda Sanitarium | Risley | E.H. | 265, 227 | trustee | California | 1920 | |||||||
15056 | board of trustees | Loma Linda Sanitarium | Bowen | L.M. | 265, 227 | trustee | California | 1920 | |||||||
15057 | board of trustees | Loma Linda Sanitarium | McElhany | J.L. | 265, 227 | trustee | California | 1920 | |||||||
15058 | board of trustees | Loma Linda Sanitarium | Evans | Newton | 265, 227 | trustee | California | 1920 | |||||||
15059 | board of trustees | Loma Linda Sanitarium | Vollmer | H.W. | 265, 227 | trustee | California | 1920 | |||||||
15060 | board of trustees | Loma Linda Sanitarium | Magan | P.T. | 265, 227 | trustee | California | 1920 | |||||||
15061 | board of trustees | Loma Linda Sanitarium | Weir | J.J. | 265, 227 | trustee | California | 1920 | |||||||
15062 | board of trustees | Loma Linda Sanitarium | Christian | J.W. | 265, 227 | trustee | California | 1920 | |||||||
15063 | officers of the board | Loma Linda Sanitarium | Evans | Newton | 265, 227 | vice president | California | 1920 | |||||||
15064 | officers | St. Helena Sanitarium | Rice | Helen N. | California Medical Missionary and Benevolent A... | 267-268 | lady head nurse | Miss | California | R.N. | 1920 | ||||
15065 | officers | St. Helena Sanitarium | Rice | Helen N. | California Medical Missionary and Benevolent A... | 267-268 | secretary | training-school | Miss | California | R.N. | 1920 | |||
15066 | Columbia Union | executive committee | Booth | A.S. | 39, 40 | committee member | 1920 | ||||||||
15067 | Columbia Union | executive committee | Harter | R.E. | 39, 41 | committee member | 1920 | ||||||||
15068 | Columbia Union | executive committee | Parsons | D.A. | 39, 42 | committee member | 1920 | ||||||||
15069 | Columbia Union | executive committee | Moffet | W.C. | 39, 44 | committee member | 1920 | ||||||||
15070 | Columbia Union | executive committee | Leland | J.A. | 39, 47 | committee member | 1920 | ||||||||
15071 | Columbia Union | executive committee | Lindsay | R.S. | 39, 48 | committee member | 1920 | ||||||||
15072 | Columbia Union | executive committee | Westbrook | T.B. | 39, 49 | committee member | 1920 | ||||||||
15073 | Columbia Union | executive committee | Shaw | Fred | 48-49 | committee member | West Pennsylvania | 1920 | |||||||
15074 | Lake Union | executive committee | Clark | A.J. | 54, 55 | committee member | 1920 | ||||||||
15075 | Lake Union | executive committee | Westworth | Wm. A. | 54, 57 | committee member | 1920 | ||||||||
15076 | Lake Union | executive committee | Holden | W.H. | 54, 59 | committee member | 1920 | ||||||||
15077 | Lake Union | executive committee | Wiest | C.S. | 54, 60 | committee member | 1920 | ||||||||
15078 | Lake Union | executive committee | Nethery | R.J. | 54, 62 | committee member | 1920 | ||||||||
15079 | Lake Union | executive committee | Irwin | J.J. | 54, 63 | committee member | 1920 | ||||||||
15080 | Lake Union | executive committee | Bristol | E.A. | 54, 64 | committee member | 1920 | ||||||||
15081 | Lake Union | executive committee | Piper | J.F. | 54, 66 | committee member | 1920 | ||||||||
15082 | Pacific Union | executive committee | Adams | W.M. | 95-96 | committee member | Southern California | 1920 |
15083 rows × 15 columns
In [10]:
last_names = df['lastname']
In [11]:
last_names_df = last_names.str.rsplit(' ', expand=True)
In [12]:
last_names_df
Out[12]:
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | Butler | None | None | None |
1 | Haskell | None | None | None |
2 | Henry | None | None | None |
3 | Kellogg | None | None | None |
4 | Kellogg | None | None | None |
5 | Oyen | None | None | None |
6 | Sisley | None | None | None |
7 | Fargo | None | None | None |
8 | Hall | None | None | None |
9 | Hall | None | None | None |
10 | Haskell | None | None | None |
11 | Henry | None | None | None |
12 | Kellogg | None | None | None |
13 | Murphy | None | None | None |
14 | Glenn | None | None | None |
15 | Haskell | None | None | None |
16 | Israel | None | None | None |
17 | Jones | None | None | None |
18 | Morrison | None | None | None |
19 | White | None | None | None |
20 | Haskell | None | None | None |
21 | Waggoner | None | None | None |
22 | White | None | None | None |
23 | Haskell | None | None | None |
24 | Vickery | None | None | None |
25 | White | None | None | None |
26 | Sisley | None | None | None |
27 | Amadon | None | None | None |
28 | Butler | None | None | None |
29 | Smith | None | None | None |
... | ... | ... | ... | ... |
15053 | Martin | None | None | None |
15054 | Reaser | None | None | None |
15055 | Risley | None | None | None |
15056 | Bowen | None | None | None |
15057 | McElhany | None | None | None |
15058 | Evans | None | None | None |
15059 | Vollmer | None | None | None |
15060 | Magan | None | None | None |
15061 | Weir | None | None | None |
15062 | Christian | None | None | None |
15063 | Evans | None | None | None |
15064 | Rice | None | None | None |
15065 | Rice | None | None | None |
15066 | Booth | None | None | None |
15067 | Harter | None | None | None |
15068 | Parsons | None | None | None |
15069 | Moffet | None | None | None |
15070 | Leland | None | None | None |
15071 | Lindsay | None | None | None |
15072 | Westbrook | None | None | None |
15073 | Shaw | None | None | None |
15074 | Clark | None | None | None |
15075 | Westworth | None | None | None |
15076 | Holden | None | None | None |
15077 | Wiest | None | None | None |
15078 | Nethery | None | None | None |
15079 | Irwin | None | None | None |
15080 | Bristol | None | None | None |
15081 | Piper | None | None | None |
15082 | Adams | None | None | None |
15083 rows × 4 columns
In [13]:
def df_to_list(df):
headers = list(df.columns.values)
list_out = []
for header in headers:
df[header].str.strip()
col = df[header].tolist()
for each in col:
if str(each).isalpha() and len(str(each)) > 2:
list_out.append(each)
else:
pass
return(list_out)
In [14]:
last_names_split = df_to_list(last_names_df)
In [15]:
out_dir = '/Users/jeriwieringa/Dissertation/drafts/data/word-lists/'
In [16]:
with open("{}{}-SDA-last-names.txt".format(out_dir,str(datetime.date.today())), "w") as f:
for name in last_names_split:
f.write("{}\n".format(name))
In [17]:
df2 = df
In [18]:
def drop_empty_rows(df, header):
df[header].replace('', np.nan, inplace=True)
df.dropna(subset=[header], inplace=True)
return(df)
In [19]:
df2 = drop_empty_rows(df2, 'location')
In [20]:
places = df2['location']
In [21]:
places = places.str.replace(' ', ',')
In [22]:
df3 = places.str.split(',', expand=True)
In [23]:
df3
Out[23]:
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Battle | Creek | MI | None | None | None | None | None | None | None | None | None | None | None | None | |
1 | South | Lancaster | MA | None | None | None | None | None | None | None | None | None | None | None | None | |
2 | Battle | Creek | MI | None | None | None | None | None | None | None | None | None | None | None | None | |
3 | Battle | Creek | MI | None | None | None | None | None | None | None | None | None | None | None | None | |
4 | Battle | Creek | MI | None | None | None | None | None | None | None | None | None | None | None | None | |
5 | Battle | Creek | MI | None | None | None | None | None | None | None | None | None | None | None | None | |
6 | Battle | Creek | MI | None | None | None | None | None | None | None | None | None | None | None | None | |
7 | Greenville | MI | None | None | None | None | None | None | None | None | None | None | None | None | None | |
8 | Battle | Creek | MI | None | None | None | None | None | None | None | None | None | None | None | None | |
9 | Battle | Creek | MI | None | None | None | None | None | None | None | None | None | None | None | None | |
10 | South | Lancaster | MA | None | None | None | None | None | None | None | None | None | None | None | None | |
11 | Battle | Creek | MI | None | None | None | None | None | None | None | None | None | None | None | None | |
12 | Battle | Creek | MI | None | None | None | None | None | None | None | None | None | None | None | None | |
13 | Battle | Creek | MI | None | None | None | None | None | None | None | None | None | None | None | None | |
14 | Oakland | CA | None | None | None | None | None | None | None | None | None | None | None | None | None | |
15 | South | Lancaster | MA | None | None | None | None | None | None | None | None | None | None | None | None | |
16 | Oakland | CA | None | None | None | None | None | None | None | None | None | None | None | None | None | |
17 | Oakland | CA | None | None | None | None | None | None | None | None | None | None | None | None | None | |
18 | Healdsburg | CA | None | None | None | None | None | None | None | None | None | None | None | None | None | |
19 | Oakland | CA | None | None | None | None | None | None | None | None | None | None | None | None | None | |
20 | South | Lancaster | MA | None | None | None | None | None | None | None | None | None | None | None | None | |
21 | Oakland | CA | None | None | None | None | None | None | None | None | None | None | None | None | None | |
22 | Oakland | CA | None | None | None | None | None | None | None | None | None | None | None | None | None | |
23 | South | Lancaster | MA | None | None | None | None | None | None | None | None | None | None | None | None | |
24 | Oakland | CA | None | None | None | None | None | None | None | None | None | None | None | None | None | |
25 | Oakland | CA | None | None | None | None | None | None | None | None | None | None | None | None | None | |
26 | Battle | Creek | MI | None | None | None | None | None | None | None | None | None | None | None | None | |
27 | Battle | Creek | MI | None | None | None | None | None | None | None | None | None | None | None | None | |
28 | Battle | Creek | MI | None | None | None | None | None | None | None | None | None | None | None | None | |
29 | Battle | Creek | MI | None | None | None | None | None | None | None | None | None | None | None | None | |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
13697 | 1302 | Harding | St. | Nashville | TN | None | None | None | None | None | None | None | None | None | ||
13698 | 1051 | Mississippi | Blvd. | Memphis | TN | None | None | None | None | None | None | None | None | None | ||
13699 | 736 | Saffrans | St. | Memphis | TN | None | None | None | None | None | None | None | None | None | ||
13700 | Ashland | City | TN | None | None | None | None | None | None | None | None | None | None | None | None | |
13701 | 901 | Buchanan | St. | Nashville | TN | None | None | None | None | None | None | None | None | None | ||
13702 | Leach | TN | None | None | None | None | None | None | None | None | None | None | None | None | None | |
13703 | Route | 1 | Burns | TN | None | None | None | None | None | None | None | None | None | None | ||
13704 | Route | 3 | Goodlettsville | TN | None | None | None | None | None | None | None | None | None | None | ||
13705 | 2310 | Siefried | St. | Nashville | TN | None | None | None | None | None | None | None | None | None | ||
13706 | 2119 | Twenty-fourth | Ave. | North | Nashville | TN | None | None | None | None | None | None | None | |||
13712 | Bon | Aqua | TN | None | None | None | None | None | None | None | None | None | None | None | None | |
13713 | Jackson | TN | None | None | None | None | None | None | None | None | None | None | None | None | None | |
13714 | Nashville | TN | None | None | None | None | None | None | None | None | None | None | None | None | None | |
13715 | 994 | Mississippi | Blvd. | Memphis | TN | None | None | None | None | None | None | None | None | None | ||
13716 | 322 | Jackson | Bldg. | Nashville | TN | None | None | None | None | None | None | None | None | None | ||
13717 | 322 | Jackson | Bldg. | Nashville | TN | None | None | None | None | None | None | None | None | None | ||
13718 | 322 | Jackson | Bldg. | Nashville | TN | None | None | None | None | None | None | None | None | None | ||
13719 | 2111 | Evelyn | Ave. | Memphis | TN | None | None | None | None | None | None | None | None | None | ||
13720 | 1900 | Twenty-Fourth | Ave. | North | Nashville | TN | None | None | None | None | None | None | None | |||
13721 | 666 | North | Dunlap | St. | Memphis | TN | None | None | None | None | None | None | None | None | ||
13722 | 2315 | Seifried | St. | Nashville | TN | None | None | None | None | None | None | None | None | None | ||
13723 | 322 | Jackson | Bldg. | Nashville | TN | None | None | None | None | None | None | None | None | None | ||
13724 | 322 | Jackson | Bldg. | Nashville | TN | None | None | None | None | None | None | None | None | None | ||
13725 | Twenty-fourth | and | New | Bridge | Road | North | Nashville | TN | None | None | None | None | None | |||
13726 | Paducah | KY | None | None | None | None | None | None | None | None | None | None | None | None | None | |
15017 | Jamaica | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
15019 | Box | 146 | Glendale | CA | None | None | None | None | None | None | None | None | None | None | ||
15020 | College | View | NE | None | None | None | None | None | None | None | None | None | None | None | None | |
15021 | 2718 | Third | Ave. | South | Minneapolis | MN | None | None | None | None | None | None | None | |||
15022 | College | Place | WA | None | None | None | None | None | None | None | None | None | None | None | None |
6467 rows × 16 columns
In [24]:
places_split = df_to_list(df3)
In [25]:
list(set(places_split))[:50]
Out[25]:
['Gray', 'Oroville', 'Shreveport', 'Sāo', 'Alto', 'Central', 'Castlereagh', 'Warrenton', 'Alexandria', 'Cannelton', 'Hydro', 'National', 'Ladysmith', 'Sylvan', 'Vaughn', 'Fish', 'Watts', 'Elgin', 'Cheng', 'Winslow', 'Junction', 'Zone', 'Logan', 'Edenville', 'Watertown', 'Lincoln', 'Kilmarnock', 'Whalan', 'Corydon', 'Danville', 'Newark', 'Fox', 'Omer', 'Springboro', 'British', 'Copenhagan', 'Ave', 'Walnut', 'Trade', 'Beach', 'Asbury', 'Fancher', 'Rileyville', 'Muscoda', 'Moorestown', 'Whipple', 'Vaud', 'Roaring', 'Antonio', 'Royal']
In [26]:
len(places_split)
Out[26]:
80237
In [27]:
with open("{}{}-SDA-place-names.txt".format(out_dir,str(datetime.date.today())), "w") as f:
for place in list(set(places_split)):
f.write("{}\n".format(place))
In [28]:
# %load shared_elements/system_info.py
import IPython
print (IPython.sys_info())
!pip freeze
{'commit_hash': '5c9c918', 'commit_source': 'installation', 'default_encoding': 'UTF-8', 'ipython_path': '/Users/jeriwieringa/miniconda3/envs/dissertation2/lib/python3.5/site-packages/IPython', 'ipython_version': '5.1.0', 'os_name': 'posix', 'platform': 'Darwin-16.1.0-x86_64-i386-64bit', 'sys_executable': '/Users/jeriwieringa/miniconda3/envs/dissertation2/bin/python', 'sys_platform': 'darwin', 'sys_version': '3.5.2 |Continuum Analytics, Inc.| (default, Jul 2 2016, ' '17:52:12) \n' '[GCC 4.2.1 Compatible Apple LLVM 4.2 (clang-425.0.28)]'} anaconda-client==1.5.5 appnope==0.1.0 argh==0.26.1 blinker==1.4 bokeh==0.12.3 boto==2.43.0 bz2file==0.98 chest==0.2.3 cloudpickle==0.2.1 clyent==1.2.2 dask==0.12.0 datashader==0.4.0 datashape==0.5.2 decorator==4.0.10 docutils==0.12 doit==0.29.0 gensim==0.12.4 Ghost.py==0.2.3 ghp-import2==1.0.1 gspread==0.4.1 HeapDict==1.0.0 httplib2==0.9.2 husl==4.0.3 ipykernel==4.5.2 ipython==5.1.0 ipython-genutils==0.1.0 ipywidgets==5.2.2 Jinja2==2.8 jsonschema==2.5.1 jupyter==1.0.0 jupyter-client==4.4.0 jupyter-console==5.0.0 jupyter-core==4.2.1 llvmlite==0.14.0 locket==0.2.0 Logbook==1.0.0 lxml==3.5.0 MacFSEvents==0.7 Mako==1.0.4 Markdown==2.6.7 MarkupSafe==0.23 mistune==0.7.3 multipledispatch==0.4.9 natsort==4.0.4 nb-anacondacloud==1.2.0 nb-conda==2.0.0 nb-conda-kernels==2.0.0 nb-config-manager==0.1.3 nbbrowserpdf==0.2.1 nbconvert==4.2.0 nbformat==4.2.0 nbpresent==3.0.2 networkx==1.11 Nikola==7.7.7 nltk==3.2.1 notebook==4.2.3 numba==0.29.0 numpy==1.11.2 oauth2client==4.0.0 odo==0.5.0 pandas==0.19.1 partd==0.3.6 path.py==0.0.0 pathtools==0.1.2 pexpect==4.0.1 pickleshare==0.7.4 Pillow==3.4.2 prompt-toolkit==1.0.9 ptyprocess==0.5.1 pyasn1==0.1.9 pyasn1-modules==0.0.8 pycrypto==2.6.1 Pygments==2.1.3 PyPDF2==1.25.1 PyRSS2Gen==1.1 python-dateutil==2.6.0 pytz==2016.10 PyYAML==3.12 pyzmq==16.0.2 qtconsole==4.2.1 requests==2.12.3 rsa==3.4.2 scipy==0.18.1 simplegeneric==0.8.1 six==1.10.0 smart-open==1.3.5 terminado==0.6 textblob==0.11.1 toolz==0.8.1 tornado==4.4.2 traitlets==4.3.1 Unidecode==0.4.19 watchdog==0.8.3 wcwidth==0.1.7 webassets==0.11.1 widgetsnbextension==1.2.6 ws4py==0.3.4 xarray==0.8.2 Yapsy==1.11.223
In [ ]: