Export top docs per topic

In [1]:
import pandas as pd
import os
In [2]:
# Directory (relative to this notebook) that holds every input file and receives the export.
dataDir = "../../data/"

# Input file names. Later cells select them by position, so the order matters:
#   files[0] -> held-out doc-topic table, files[1] -> main doc-topic table, files[2] -> topic labels.
files = [
    "target_300_10.Holdout.18497.docTopics.txt",
    "target_300_10.18497.docTopics.txt",
    "TopicLabels.csv",
]
In [3]:
# Main doc-topic table: tab-separated, read without a header row so the
# columns get default integer labels.
dt = pd.read_table(
    os.path.join(dataDir, files[1]),
    sep='\t',
    header=None,
)
In [4]:
# Held-out doc-topic table: same tab-separated layout, but the first line of
# the file is skipped (presumably a header line in this file - verify).
dt_holdouts = pd.read_table(
    os.path.join(dataDir, files[0]),
    sep='\t',
    skiprows=1,
    header=None,
)
In [5]:
# Stack the held-out documents underneath the main doc-topic table.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported equivalent. ignore_index=True gives the combined frame a
# fresh 0..n-1 index instead of repeating the row labels of both inputs.
# Caution: this cell rebinds `dt`, so re-running it without re-running the
# loading cell stacks the held-out rows a second time.
dt = pd.concat([dt, dt_holdouts], ignore_index=True)
In [6]:
# Peek at the first five rows of the combined table (columns still carry default integer labels).
dt.head()
Out[6]:
0 1 2 3 4 5 6 7 8 9 ... 242 243 244 245 246 247 248 249 250 251
0 0 ADV18981201-V02-01-page12.txt 0.000247 0.000077 0.000303 0.000252 0.000168 0.000858 0.000106 0.000355 ... 0.000093 0.000146 0.000120 0.000251 0.000099 0.008392 0.000134 0.000146 0.000133 0.000091
1 1 ADV18981201-V02-01-page13.txt 0.000279 0.000087 0.000343 0.000285 0.037680 0.010343 0.000120 0.000402 ... 0.000106 0.000165 0.000135 0.000284 0.000112 0.000121 0.000151 0.000165 0.000150 0.000103
2 2 ADV18981201-V02-01-page15.txt 0.000265 0.000082 0.000325 0.000269 0.000180 0.009792 0.000113 0.000381 ... 0.000100 0.000157 0.000128 0.000269 0.000106 0.000115 0.000143 0.000157 0.000142 0.000098
3 3 ADV18981201-V02-01-page16.txt 0.000293 0.000091 0.000360 0.000299 0.000200 0.001018 0.000125 0.000422 ... 0.000111 0.000173 0.000142 0.000298 0.000118 0.000127 0.000159 0.000174 0.000158 0.000109
4 4 ADV18981201-V02-01-page20.txt 0.000237 0.000074 0.055981 0.000242 0.000162 0.000824 0.000101 0.000341 ... 0.000090 0.000140 0.000115 0.008196 0.000095 0.000103 0.000128 0.000140 0.000128 0.000088

5 rows × 252 columns

In [7]:
# Column labels for the doc-topic table: two identifier columns followed by
# one integer label per topic (0 through 249).
topics = ['doc_id', "filename"] + list(range(250))
# Show the first few labels as a sanity check.
topics[:10]
Out[7]:
['doc_id', 'filename', 0, 1, 2, 3, 4, 5, 6, 7]
In [8]:
# Replace the default integer column labels with the names built in `topics`
# (doc_id, filename, then topic ids). pandas raises ValueError if the list
# length does not match the number of columns. This relabels `dt` in place.
dt.columns = topics
In [9]:
# Peek at the first five rows again to confirm the new column labels.
dt.head()
Out[9]:
doc_id filename 0 1 2 3 4 5 6 7 ... 240 241 242 243 244 245 246 247 248 249
0 0 ADV18981201-V02-01-page12.txt 0.000247 0.000077 0.000303 0.000252 0.000168 0.000858 0.000106 0.000355 ... 0.000093 0.000146 0.000120 0.000251 0.000099 0.008392 0.000134 0.000146 0.000133 0.000091
1 1 ADV18981201-V02-01-page13.txt 0.000279 0.000087 0.000343 0.000285 0.037680 0.010343 0.000120 0.000402 ... 0.000106 0.000165 0.000135 0.000284 0.000112 0.000121 0.000151 0.000165 0.000150 0.000103
2 2 ADV18981201-V02-01-page15.txt 0.000265 0.000082 0.000325 0.000269 0.000180 0.009792 0.000113 0.000381 ... 0.000100 0.000157 0.000128 0.000269 0.000106 0.000115 0.000143 0.000157 0.000142 0.000098
3 3 ADV18981201-V02-01-page16.txt 0.000293 0.000091 0.000360 0.000299 0.000200 0.001018 0.000125 0.000422 ... 0.000111 0.000173 0.000142 0.000298 0.000118 0.000127 0.000159 0.000174 0.000158 0.000109
4 4 ADV18981201-V02-01-page20.txt 0.000237 0.000074 0.055981 0.000242 0.000162 0.000824 0.000101 0.000341 ... 0.000090 0.000140 0.000115 0.008196 0.000095 0.000103 0.000128 0.000140 0.000128 0.000088

5 rows × 252 columns

In [10]:
# Topic label sheet; the export cell below uses its mallet_topic_id,
# topic_label and topic_words columns.
labels = pd.read_csv(
    os.path.join(dataDir, files[2]),
)
labels.head()
Out[10]:
mallet_topic_id browser_topic_id topic_label topic_words Unnamed: 4 Unnamed: 5
0 5 6 [unclear] thing know think way man want let doe right go... 5 0.10352
1 12 13 [unclear] duty good let person think case christian mind... 12 0.05102
2 80 81 [unclear] iii testimony chap vii viii xiv xii verse xvi ... 80 0.01027
3 99 100 [unclear] christ father son word man thing world unto jo... 99 0.01838
4 102 103 [unclear] lord brother sister help truth blessing heart ... 102 0.06279
In [11]:
# Export, for each selected topic, a short text section containing the topic
# id, its label, its top words, and the file names of the documents with the
# highest weight for that topic.

# NOTE(review): only topics 100-249 are exported although the table has
# columns for topics 0-249 - confirm that skipping 0-99 is intentional.
TOPIC_IDS = range(100, 250)
# Number of highest-weighted documents listed per topic (matches the output file name).
TOP_N_DOCS = 20

# Index the label sheet by topic id once instead of boolean-filtering it twice
# per topic. verify_integrity=True raises ValueError on duplicate ids, keeping
# the original "exactly one row per topic" requirement.
labels_by_topic = labels.set_index('mallet_topic_id', verify_integrity=True)

with open(os.path.join(dataDir, 'top20docsPerTopic.txt'), 'w', encoding='utf-8') as f:
    for topic in TOPIC_IDS:
        # Raises KeyError if the label sheet has no row for this topic.
        topic_info = labels_by_topic.loc[topic]

        # Explicit "\n" separators instead of an indented triple-quoted string:
        # the old literals wrote the source indentation into the file, so the
        # headings, labels and word lists all started with stray spaces.
        f.write("\n# Topic: {}\nLabel: {}\n\n{}\n".format(
            topic, topic_info['topic_label'], topic_info['topic_words']))

        # Take the top rows for this topic's column directly rather than
        # sorting the whole frame for every topic.
        top_docs = dt.nlargest(TOP_N_DOCS, topic)['filename'].tolist()

        for doc in top_docs:
            f.write("## {}\n".format(doc))

        # Separator between topic sections; the next section starts with "\n".
        f.write("---")
In [ ]: