{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2018-03-25T05:48:22.449308Z", "start_time": "2018-03-25T05:48:21.737040Z" }, "collapsed": true }, "outputs": [], "source": [ "import os\n", "\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2018-03-25T05:48:22.456431Z", "start_time": "2018-03-25T05:48:22.451770Z" }, "collapsed": true }, "outputs": [], "source": [ "# Input locations: files[0] is the holdout doc-topics file, files[1] the main doc-topics file.\n", "dataDir = \"../../data/\"\n", "files = [\"target_300_10.Holdout.18497.docTopics.txt\", \"target_300_10.18497.docTopics.txt\", \"TopicLabels.csv\"]" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2018-03-25T05:48:40.520669Z", "start_time": "2018-03-25T05:48:22.724557Z" }, "collapsed": true }, "outputs": [], "source": [ "# Tab-separated file read without a header row, so columns get integer labels.\n", "dt = pd.read_table(os.path.join(dataDir, files[1]), sep='\\t', header=None)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2018-03-25T05:48:42.281243Z", "start_time": "2018-03-25T05:48:40.527005Z" }, "collapsed": true }, "outputs": [], "source": [ "# skiprows=1 drops the first line of the holdout file (presumably a header row -- confirm).\n", "dt_holdouts = pd.read_table(os.path.join(dataDir, files[0]), sep='\\t', skiprows=1, header=None)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2018-03-25T05:48:42.640836Z", "start_time": "2018-03-25T05:48:42.283426Z" }, "collapsed": true }, "outputs": [], "source": [ "# pd.concat replaces DataFrame.append (removed in pandas 2.0); row index labels are kept as-is.\n", "# Note: this cell rebinds dt, so running it twice stacks the holdout rows twice.\n", "dt = pd.concat([dt, dt_holdouts])" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "ExecuteTime": { "end_time": "2018-03-25T05:48:42.701074Z", "start_time": "2018-03-25T05:48:42.643652Z" }, "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", " | 0 | \n", "1 | \n", "2 | \n", "3 | \n", "4 | \n", "5 | \n", "6 | \n", "7 | \n", "8 | \n", "9 | \n", "... | \n", "242 | \n", "243 | \n", "244 | \n", "245 | \n", "246 | \n", "247 | \n", "248 | \n", "249 | \n", "250 | \n", "251 | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "0 | \n", "ADV18981201-V02-01-page12.txt | \n", "0.000247 | \n", "0.000077 | \n", "0.000303 | \n", "0.000252 | \n", "0.000168 | \n", "0.000858 | \n", "0.000106 | \n", "0.000355 | \n", "... | \n", "0.000093 | \n", "0.000146 | \n", "0.000120 | \n", "0.000251 | \n", "0.000099 | \n", "0.008392 | \n", "0.000134 | \n", "0.000146 | \n", "0.000133 | \n", "0.000091 | \n", "
1 | \n", "1 | \n", "ADV18981201-V02-01-page13.txt | \n", "0.000279 | \n", "0.000087 | \n", "0.000343 | \n", "0.000285 | \n", "0.037680 | \n", "0.010343 | \n", "0.000120 | \n", "0.000402 | \n", "... | \n", "0.000106 | \n", "0.000165 | \n", "0.000135 | \n", "0.000284 | \n", "0.000112 | \n", "0.000121 | \n", "0.000151 | \n", "0.000165 | \n", "0.000150 | \n", "0.000103 | \n", "
2 | \n", "2 | \n", "ADV18981201-V02-01-page15.txt | \n", "0.000265 | \n", "0.000082 | \n", "0.000325 | \n", "0.000269 | \n", "0.000180 | \n", "0.009792 | \n", "0.000113 | \n", "0.000381 | \n", "... | \n", "0.000100 | \n", "0.000157 | \n", "0.000128 | \n", "0.000269 | \n", "0.000106 | \n", "0.000115 | \n", "0.000143 | \n", "0.000157 | \n", "0.000142 | \n", "0.000098 | \n", "
3 | \n", "3 | \n", "ADV18981201-V02-01-page16.txt | \n", "0.000293 | \n", "0.000091 | \n", "0.000360 | \n", "0.000299 | \n", "0.000200 | \n", "0.001018 | \n", "0.000125 | \n", "0.000422 | \n", "... | \n", "0.000111 | \n", "0.000173 | \n", "0.000142 | \n", "0.000298 | \n", "0.000118 | \n", "0.000127 | \n", "0.000159 | \n", "0.000174 | \n", "0.000158 | \n", "0.000109 | \n", "
4 | \n", "4 | \n", "ADV18981201-V02-01-page20.txt | \n", "0.000237 | \n", "0.000074 | \n", "0.055981 | \n", "0.000242 | \n", "0.000162 | \n", "0.000824 | \n", "0.000101 | \n", "0.000341 | \n", "... | \n", "0.000090 | \n", "0.000140 | \n", "0.000115 | \n", "0.008196 | \n", "0.000095 | \n", "0.000103 | \n", "0.000128 | \n", "0.000140 | \n", "0.000128 | \n", "0.000088 | \n", "
5 rows × 252 columns
\n", "\n", " | doc_id | \n", "filename | \n", "0 | \n", "1 | \n", "2 | \n", "3 | \n", "4 | \n", "5 | \n", "6 | \n", "7 | \n", "... | \n", "240 | \n", "241 | \n", "242 | \n", "243 | \n", "244 | \n", "245 | \n", "246 | \n", "247 | \n", "248 | \n", "249 | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "0 | \n", "ADV18981201-V02-01-page12.txt | \n", "0.000247 | \n", "0.000077 | \n", "0.000303 | \n", "0.000252 | \n", "0.000168 | \n", "0.000858 | \n", "0.000106 | \n", "0.000355 | \n", "... | \n", "0.000093 | \n", "0.000146 | \n", "0.000120 | \n", "0.000251 | \n", "0.000099 | \n", "0.008392 | \n", "0.000134 | \n", "0.000146 | \n", "0.000133 | \n", "0.000091 | \n", "
1 | \n", "1 | \n", "ADV18981201-V02-01-page13.txt | \n", "0.000279 | \n", "0.000087 | \n", "0.000343 | \n", "0.000285 | \n", "0.037680 | \n", "0.010343 | \n", "0.000120 | \n", "0.000402 | \n", "... | \n", "0.000106 | \n", "0.000165 | \n", "0.000135 | \n", "0.000284 | \n", "0.000112 | \n", "0.000121 | \n", "0.000151 | \n", "0.000165 | \n", "0.000150 | \n", "0.000103 | \n", "
2 | \n", "2 | \n", "ADV18981201-V02-01-page15.txt | \n", "0.000265 | \n", "0.000082 | \n", "0.000325 | \n", "0.000269 | \n", "0.000180 | \n", "0.009792 | \n", "0.000113 | \n", "0.000381 | \n", "... | \n", "0.000100 | \n", "0.000157 | \n", "0.000128 | \n", "0.000269 | \n", "0.000106 | \n", "0.000115 | \n", "0.000143 | \n", "0.000157 | \n", "0.000142 | \n", "0.000098 | \n", "
3 | \n", "3 | \n", "ADV18981201-V02-01-page16.txt | \n", "0.000293 | \n", "0.000091 | \n", "0.000360 | \n", "0.000299 | \n", "0.000200 | \n", "0.001018 | \n", "0.000125 | \n", "0.000422 | \n", "... | \n", "0.000111 | \n", "0.000173 | \n", "0.000142 | \n", "0.000298 | \n", "0.000118 | \n", "0.000127 | \n", "0.000159 | \n", "0.000174 | \n", "0.000158 | \n", "0.000109 | \n", "
4 | \n", "4 | \n", "ADV18981201-V02-01-page20.txt | \n", "0.000237 | \n", "0.000074 | \n", "0.055981 | \n", "0.000242 | \n", "0.000162 | \n", "0.000824 | \n", "0.000101 | \n", "0.000341 | \n", "... | \n", "0.000090 | \n", "0.000140 | \n", "0.000115 | \n", "0.008196 | \n", "0.000095 | \n", "0.000103 | \n", "0.000128 | \n", "0.000140 | \n", "0.000128 | \n", "0.000088 | \n", "
5 rows × 252 columns
\n", "\n", " | mallet_topic_id | \n", "browser_topic_id | \n", "topic_label | \n", "topic_words | \n", "Unnamed: 4 | \n", "Unnamed: 5 | \n", "
---|---|---|---|---|---|---|
0 | \n", "5 | \n", "6 | \n", "[unclear] | \n", "thing know think way man want let doe right go... | \n", "5 | \n", "0.10352 | \n", "
1 | \n", "12 | \n", "13 | \n", "[unclear] | \n", "duty good let person think case christian mind... | \n", "12 | \n", "0.05102 | \n", "
2 | \n", "80 | \n", "81 | \n", "[unclear] | \n", "iii testimony chap vii viii xiv xii verse xvi ... | \n", "80 | \n", "0.01027 | \n", "
3 | \n", "99 | \n", "100 | \n", "[unclear] | \n", "christ father son word man thing world unto jo... | \n", "99 | \n", "0.01838 | \n", "
4 | \n", "102 | \n", "103 | \n", "[unclear] | \n", "lord brother sister help truth blessing heart ... | \n", "102 | \n", "0.06279 | \n", "