Process metadata for dfr-browser
Metadata fields for the browser:
- doi: url
- title: expanded name (e.g., Review and Herald (1.1) Dec. 12, 1870)
- authors: blank
- journal: expanded journal name
- volume: number
- issue: number
- date: yyyy-mm-dd
- page: page number
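For orientation, a finished row in meta.csv should look roughly like this (the id and values are hypothetical, but the column order matches the writerow call at the end of the notebook):

RH18701212-V36-24-page5.txt,"Review and Herald (Vol. 36.24) Dec 12, 1870, page 5",NA,Review and Herald,36,24,1870-12-12,5,http://documents.adventistarchives.org/Periodicals/RH/RH18701212-V36-24.pdf,RH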
In [1]:
import csv
import datetime
import re
import urllib.parse

import pandas as pd
from text2topics import utilities
In [2]:
base_url = "http://documents.adventistarchives.org/Periodicals/"
In [3]:
title_keys = {"ADV": "Training School Advocate",
              "AmSn": "American Sentinel",
              "ARAI": "Advent Review and Sabbath Herald Anniversary Issue",
              "CE": "Christian Education",
              "CUV": "Welcome Visitor (Columbia Union Visitor)",
              "EDU": "Christian Educator",
              "GCB": "General Conference Bulletin",
              "GH": "Gospel Herald",
              "GOH": "Gospel of Health",
              "GS": "Gospel Sickle",
              "HM": "Home Missionary",
              "HR": "Health Reformer",
              "IR": "Indiana Reporter",
              "LB": "Life Boat",
              "LH": "Life and Health",
              "LibM": "Liberty",
              "LUH": "Lake Union Herald",
              "NMN": "North Michigan News Sheet",
              "PHJ": "Pacific Health Journal and Temperance Advocate",
              "PTAR": "Present Truth (Advent Review)",
              "PUR": "Pacific Union Recorder",
              "RH": "Review and Herald",
              "Sligo": "Sligonian",
              "SOL": "Sentinel of Liberty",
              "ST": "Signs of the Times",
              "SUW": "Report of Progress, Southern Union Conference",
              "TCOG": "The Church Officer's Gazette",
              "TMM": "The Missionary Magazine",
              "WMH": "West Michigan Herald",
              "YI": "Youth's Instructor"}
In [4]:
df = pd.read_table('/Users/jeriwieringa/Dissertation/models/data/target_300_10_Sample.txt',
                   header=None,
                   names=['doc_id', 'label', 'content'])
In [5]:
df
Out[5]:
(dataframe preview not preserved in this export)
In [6]:
def get_split_id(doc_id):
    # Ids are hyphen-delimited, e.g. title+date, volume, issue, page segments.
    return doc_id.split('-')

def construct_url(base_url, abrev, split_id):
    # Drop the trailing page segment to link to the PDF of the full issue.
    return urllib.parse.urljoin(base_url, "{}/{}.pdf".format(abrev, "-".join(split_id[:-1])))

def get_date(split_id):
    # The first segment carries the date digits (yyyy, yyyymm, or yyyymmdd).
    return re.search(r'[0-9]+', split_id[0]).group()

def get_volume(split_id):
    # The second segment is the volume, e.g. 'V36' -> '36'.
    return re.search(r'[0-9]+', split_id[1]).group()

def get_issue(split_id):
    # The third segment is the issue number.
    return split_id[2]

def get_page(split_id):
    # The fourth segment is the page, e.g. 'page5' -> '5'.
    return re.search(r'[0-9]+', split_id[3]).group()
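As a quick sanity check, the helpers can be exercised on a hypothetical id that follows the pattern these functions assume:

In [ ]:
# 'RH18701212-V36-24-page5.txt' is an invented id for illustration only.
sample_id = "RH18701212-V36-24-page5.txt"
parts = get_split_id(sample_id)
print(parts)            # ['RH18701212', 'V36', '24', 'page5.txt']
print(get_date(parts))  # '18701212'
print(get_volume(parts), get_issue(parts), get_page(parts))  # 36 24 5
print(construct_url(base_url, "RH", parts))
# http://documents.adventistarchives.org/Periodicals/RH/RH18701212-V36-24.pdf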
In [7]:
with open('/Users/jeriwieringa/Dissertation/browser/data/meta.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for index, row in df.iterrows():
        _id = row['doc_id'].strip()
        abrev = utilities.get_title(_id)
        split_id = get_split_id(_id)
        date_data = get_date(split_id)
        url = construct_url(base_url, abrev, split_id)

        # Ids record dates at varying precision: yyyy, yyyymm, or yyyymmdd.
        try:
            if len(date_data) == 4:
                date = datetime.datetime.strptime(date_data, "%Y").date()
            elif len(date_data) < 7:
                date = datetime.datetime.strptime(date_data, "%Y%m").date()
            else:
                date = datetime.datetime.strptime(date_data, "%Y%m%d").date()
        except ValueError:
            # Fall back to year and month when the full string fails to parse.
            date = datetime.datetime.strptime(date_data[:6], "%Y%m").date()

        # Some ids lack volume or issue segments; mark those as "XX".
        try:
            volume = get_volume(split_id)
        except (AttributeError, IndexError):
            volume = "XX"
        try:
            issue = get_issue(split_id)
        except IndexError:
            issue = "XX"

        page = get_page(split_id)
        journal = title_keys[abrev]
        title = "{} (Vol. {}.{}) {}, page {}".format(journal, volume, issue, date.strftime('%b %d, %Y'), page)
        writer.writerow([_id, title, "NA", journal, volume, issue, date, page, url, abrev])
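To confirm that the file matches the schema sketched at the top of the notebook, the metadata can be read back and the first row inspected (a minimal check, assuming the cell above ran without error):

In [ ]:
# Spot-check the first row of the metadata file just written.
with open('/Users/jeriwieringa/Dissertation/browser/data/meta.csv', newline='') as csvfile:
    reader = csv.reader(csvfile)
    print(next(reader))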