2018-02-CreateZipsOfCorpusSubsets

The output of this notebook is eight compressed archives (four sample/holdout pairs of `.tar.gz` files) that represent four configurations of the corpus. These are used to test the stability of the MALLET model as well as the value added by controlling the quality of the documents used to create the models (per [cite]).

Variations are:

  1. Control - Random sample of documents
  2. Target - Includes only documents with at least 300 tokens and an error rate under 10%
  3. Test1 - Includes documents with at least 300 tokens, but ignores error rate
  4. Test2 - Includes documents with at least 300 tokens and an error rate under 25%
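
The four configurations reduce to simple boolean filters over the per-document statistics. A minimal sketch against a toy frame (column names match the stats CSV loaded below; the row values are invented for illustration):

```python
import pandas as pd

# Toy stand-in for the per-document OCR statistics frame.
stats = pd.DataFrame({
    "doc_id": ["a.txt", "b.txt", "c.txt", "d.txt"],
    "num_tokens": [250, 400, 500, 600],
    "error_rate": [0.05, 0.08, 0.15, 0.30],
})

# Target: at least 300 tokens AND error rate under 10%.
target = stats[(stats["num_tokens"] >= 300) & (stats["error_rate"] < 0.10)]
# Test1: at least 300 tokens, error rate ignored.
test1 = stats[stats["num_tokens"] >= 300]
# Test2: at least 300 tokens AND error rate under 25%.
test2 = stats[(stats["num_tokens"] >= 300) & (stats["error_rate"] < 0.25)]

print(len(target), len(test1), len(test2))  # 1 3 2
```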
In [31]:
%load_ext autoreload
%autoreload 2
In [32]:
from text2topics import utilities
import math
import numpy as np
import os
import pandas as pd
import tarfile
In [2]:
fullCorpus = "/Users/jeriwieringa/Dissertation/text/text/2017-04-Final-Corpus.tar.gz"
statsDir = "/Users/jeriwieringa/Dissertation/drafts/data/module-3/2017-05-corpus-stats/"
corpusDir = "/Users/jeriwieringa/Dissertation/text/text/2018-02-CorpusSubSets/"
In [3]:
df = pd.read_csv(os.path.join(statsDir, '2017-05-Composite-OCR-statistics.csv'))
In [26]:
df
Out[26]:
doc_id error_rate num_tokens num_errors year title
0 ADV18981201-V02-01-page1.txt 0.157 51 8 1898 ADV
1 ADV18981201-V02-01-page10.txt 0.021 240 5 1898 ADV
2 ADV18981201-V02-01-page11.txt 0.011 282 3 1898 ADV
3 ADV18981201-V02-01-page12.txt 0.016 315 5 1898 ADV
4 ADV18981201-V02-01-page13.txt 0.031 353 11 1898 ADV
5 ADV18981201-V02-01-page14.txt 0.011 275 3 1898 ADV
6 ADV18981201-V02-01-page15.txt 0.019 308 6 1898 ADV
7 ADV18981201-V02-01-page16.txt 0.025 316 8 1898 ADV
8 ADV18981201-V02-01-page17.txt 0.103 223 23 1898 ADV
9 ADV18981201-V02-01-page18.txt 0.015 264 4 1898 ADV
10 ADV18981201-V02-01-page19.txt 0.040 277 11 1898 ADV
11 ADV18981201-V02-01-page2.txt 0.008 240 2 1898 ADV
12 ADV18981201-V02-01-page20.txt 0.013 315 4 1898 ADV
13 ADV18981201-V02-01-page21.txt 0.019 261 5 1898 ADV
14 ADV18981201-V02-01-page22.txt 0.010 286 3 1898 ADV
15 ADV18981201-V02-01-page23.txt 0.011 261 3 1898 ADV
16 ADV18981201-V02-01-page24.txt 0.016 254 4 1898 ADV
17 ADV18981201-V02-01-page25.txt 0.035 254 9 1898 ADV
18 ADV18981201-V02-01-page26.txt 0.202 129 26 1898 ADV
19 ADV18981201-V02-01-page3.txt 0.017 242 4 1898 ADV
20 ADV18981201-V02-01-page4.txt 0.008 245 2 1898 ADV
21 ADV18981201-V02-01-page5.txt 0.027 261 7 1898 ADV
22 ADV18981201-V02-01-page6.txt 0.024 253 6 1898 ADV
23 ADV18981201-V02-01-page7.txt 0.008 252 2 1898 ADV
24 ADV18981201-V02-01-page8.txt 0.017 237 4 1898 ADV
25 ADV18981201-V02-01-page9.txt 0.077 195 15 1898 ADV
26 ADV18990101-V01-01-page1.txt 0.750 4 3 1899 ADV
27 ADV18990101-V01-01-page10.txt 0.022 224 5 1899 ADV
28 ADV18990101-V01-01-page11.txt 0.004 225 1 1899 ADV
29 ADV18990101-V01-01-page12.txt 0.013 232 3 1899 ADV
... ... ... ... ... ... ...
197913 YI19200106-V68-01-page9.txt 0.038 846 32 1920 YI
197914 YI19200113-V68-02-page1.txt 0.279 61 17 1920 YI
197915 YI19200113-V68-02-page10.txt 0.028 1193 34 1920 YI
197916 YI19200113-V68-02-page11.txt 0.013 927 12 1920 YI
197917 YI19200113-V68-02-page12.txt 0.016 1359 22 1920 YI
197918 YI19200113-V68-02-page13.txt 0.015 1572 23 1920 YI
197919 YI19200113-V68-02-page14.txt 0.011 1033 11 1920 YI
197920 YI19200113-V68-02-page2.txt 0.007 1121 8 1920 YI
197921 YI19200113-V68-02-page3.txt 0.006 1035 6 1920 YI
197922 YI19200113-V68-02-page4.txt 0.008 995 8 1920 YI
197923 YI19200113-V68-02-page5.txt 0.008 1197 10 1920 YI
197924 YI19200113-V68-02-page6.txt 0.003 1152 3 1920 YI
197925 YI19200113-V68-02-page7.txt 0.013 1151 15 1920 YI
197926 YI19200113-V68-02-page8.txt 0.011 1110 12 1920 YI
197927 YI19200113-V68-02-page9.txt 0.011 847 9 1920 YI
197928 YI19200120-V68-03-page1.txt 0.000 32 0 1920 YI
197929 YI19200120-V68-03-page10.txt 0.013 843 11 1920 YI
197930 YI19200120-V68-03-page11.txt 0.013 1190 16 1920 YI
197931 YI19200120-V68-03-page12.txt 0.011 995 11 1920 YI
197932 YI19200120-V68-03-page13.txt 0.031 1276 39 1920 YI
197933 YI19200120-V68-03-page14.txt 0.013 1310 17 1920 YI
197934 YI19200120-V68-03-page15.txt NaN 0 0 1920 YI
197935 YI19200120-V68-03-page2.txt 0.020 1155 23 1920 YI
197936 YI19200120-V68-03-page3.txt 0.008 951 8 1920 YI
197937 YI19200120-V68-03-page4.txt 0.008 1226 10 1920 YI
197938 YI19200120-V68-03-page5.txt 0.004 1158 5 1920 YI
197939 YI19200120-V68-03-page6.txt 0.020 1087 22 1920 YI
197940 YI19200120-V68-03-page7.txt 0.036 855 31 1920 YI
197941 YI19200120-V68-03-page8.txt 0.005 1132 6 1920 YI
197942 YI19200120-V68-03-page9.txt 0.019 728 14 1920 YI

197943 rows × 6 columns

In [5]:
fullCorpusObject = tarfile.open(fullCorpus)

A quick sanity check to make sure the file names in the tar archive match those in the stats table.

In [6]:
tarPathNames = fullCorpusObject.getnames()[1:]

tarFileNames = []
for path in tarPathNames:
    tarFileNames.append(os.path.basename(path))
In [7]:
statsFileNames = df['doc_id'].tolist()
In [14]:
# print(statsFileNames[:10])
# print(tarFileNames[:10])
print (tarPathNames[:10])
['2017-04-Final-Corpus/ADV18981201-V02-01-page1.txt', '2017-04-Final-Corpus/ADV18981201-V02-01-page10.txt', '2017-04-Final-Corpus/ADV18981201-V02-01-page11.txt', '2017-04-Final-Corpus/ADV18981201-V02-01-page12.txt', '2017-04-Final-Corpus/ADV18981201-V02-01-page13.txt', '2017-04-Final-Corpus/ADV18981201-V02-01-page14.txt', '2017-04-Final-Corpus/ADV18981201-V02-01-page15.txt', '2017-04-Final-Corpus/ADV18981201-V02-01-page16.txt', '2017-04-Final-Corpus/ADV18981201-V02-01-page17.txt', '2017-04-Final-Corpus/ADV18981201-V02-01-page18.txt']
In [9]:
len(list(set(tarFileNames)-set(statsFileNames)))
Out[9]:
0

Create Random Sample

In [10]:
sampleSize = math.ceil(.4*len(statsFileNames))
# print(sampleSize)
In [11]:
randomSample = np.random.choice(statsFileNames, sampleSize, replace=False).tolist()
# print(randomSample[:10])
In [25]:
#https://stackoverflow.com/questions/17616340/add-files-from-one-tar-into-another-tar-in-python

# randomSampleTar = tarfile.open(os.path.join(corpusDir, 'randomSample.tar.gz'), 'w:gz')
# randomHoldoutTar = tarfile.open(os.path.join(corpusDir, 'randomHoldout.tar.gz'), 'w:gz')

# for member in fullCorpusObject.getmembers()[1:]:
#     if os.path.basename(member.name) in randomSample:
#         randomSampleTar.addfile(member, fullCorpusObject.extractfile(member))
#     else:
#         randomHoldoutTar.addfile(member, fullCorpusObject.extractfile(member))

# randomSampleTar.close()
# randomHoldoutTar.close()
In [ ]:
# This logic has been abstracted into the library. Use this version on subsequent runs.

# utilities.create_tar_files(corpusDir, 'random', fullCorpusObject, randomSample)
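
The library helper presumably mirrors the inline logic in the commented cell above: split the corpus members into a sample archive and a holdout archive based on a list of file names. A hypothetical re-implementation (the actual signature and behavior of `text2topics.utilities.create_tar_files` may differ):

```python
import os
import tarfile

def create_tar_files(out_dir, prefix, corpus_tar, keep_names):
    """Split the members of an open tarfile into <prefix>sample.tar.gz
    and <prefix>holdout.tar.gz archives in out_dir."""
    keep = set(keep_names)
    sample_path = os.path.join(out_dir, prefix + "sample.tar.gz")
    holdout_path = os.path.join(out_dir, prefix + "holdout.tar.gz")
    with tarfile.open(sample_path, "w:gz") as sample_tar, \
         tarfile.open(holdout_path, "w:gz") as holdout_tar:
        for member in corpus_tar.getmembers():
            if not member.isfile():
                continue  # skip the top-level directory entry
            dest = sample_tar if os.path.basename(member.name) in keep else holdout_tar
            dest.addfile(member, corpus_tar.extractfile(member))
```

Writing both archives in a single pass over the source tar avoids extracting the full corpus to disk.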

Create Target Subset

In [28]:
target_df = df[(df['num_tokens'] >= 300) & (df['error_rate'] < 0.1)]
In [29]:
len(target_df)
Out[29]:
180844
In [34]:
# utilities.create_tar_files(corpusDir, 'target_300_10_', fullCorpusObject, target_df['doc_id'].tolist())

Create Test Set

For this test, I filtered only by the minimum number of tokens (at least 300), ignoring the error rate.

In [41]:
test_df = df[df['num_tokens'] >= 300]
In [42]:
targetList = test_df['doc_id'].tolist()
In [43]:
utilities.create_tar_files(corpusDir, 'test_300_noMax_', fullCorpusObject, targetList)

Create Second Test Set

For the second test, I filtered by a minimum of 300 tokens and an error rate under 25%.

In [44]:
test_df2 = df[(df['num_tokens'] >= 300) & (df['error_rate'] < 0.25)]
In [45]:
testList = test_df2['doc_id'].tolist()
In [47]:
utilities.create_tar_files(corpusDir, 'test_300_25_', fullCorpusObject, testList)
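
Since each run writes a sample/holdout pair that partitions the full corpus, the archive sizes should reconcile with the counts reported above. A quick arithmetic check using the totals from the notebook's output:

```python
import math

total_docs = 197943                        # rows in the stats CSV (Out[26])
sample_size = math.ceil(0.4 * total_docs)  # random sample is 40% of the corpus
print(sample_size, total_docs - sample_size)

target_docs = 180844                       # len(target_df) from Out[29]
print(total_docs - target_docs)            # documents excluded by the target filter
```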