Export top docs per topic
In [1]:
import pandas as pd
import os
In [2]:
# Directory holding the topic-model outputs (relative path).
dataDir = "../../data/"

# Input files, referenced by position in later cells:
#   files[0] - doc-topic table loaded as `dt_holdouts`
#   files[1] - doc-topic table loaded as `dt`
#   files[2] - CSV of topic labels
files = [
    "target_300_10.Holdout.18497.docTopics.txt",
    "target_300_10.18497.docTopics.txt",
    "TopicLabels.csv",
]
In [3]:
# Load the main doc-topic table: tab-separated, no header row (the first
# line of the file is parsed as data).
dt = pd.read_table(
    os.path.join(dataDir, files[1]),
    sep="\t",
    header=None,
)
In [4]:
# Load the holdout doc-topic table: tab-separated, no header row. Unlike the
# main table, the first line of this file is skipped before parsing.
dt_holdouts = pd.read_table(
    os.path.join(dataDir, files[0]),
    sep="\t",
    skiprows=1,
    header=None,
)
In [5]:
# Stack the holdout rows beneath the main table. pd.concat replaces
# DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
# Row labels are kept as-is (no ignore_index), matching the old append call.
dt = pd.concat([dt, dt_holdouts])
In [6]:
# Preview the first five rows of the combined table.
dt.head(5)
Out[6]:
In [7]:
# Column names for the doc-topic table: two identifying columns followed by
# one integer-named column per topic (0 through 249).
topics = ["doc_id", "filename"] + list(range(250))
topics[:10]  # preview the first few names
Out[7]:
In [8]:
# Rename the columns in place so each topic column is addressable by its
# integer id; pandas raises a ValueError if len(topics) differs from the
# number of columns in dt.
dt.columns = topics
In [9]:
# Preview the first five rows of dt.
dt.head(5)
Out[9]:
In [10]:
# Topic metadata; the export cell below reads the columns mallet_topic_id,
# topic_label and topic_words from it.
labels_path = os.path.join(dataDir, files[2])
labels = pd.read_csv(labels_path)
labels.head(5)
Out[10]:
In [11]:
# Write a Markdown-style report: for each topic id in 100..249, emit the
# topic's label and words followed by the 20 documents with the highest
# weight in that topic's column.
# The encoding is pinned to UTF-8 so the output does not depend on the
# platform's default locale.
with open(os.path.join(dataDir, 'top20docsPerTopic.txt'), 'w', encoding='utf-8') as f:
    for topic in range(100, 250):
        # Filter the label table once per topic; .item() raises ValueError
        # unless exactly one row matches this topic id.
        topic_row = labels[labels['mallet_topic_id'] == topic]

        # Explicit "\n" escapes keep the emitted text independent of how
        # this cell happens to be indented.
        f.write("\n# Topic: {}\nLabel: {}\n{}\n".format(
            topic,
            topic_row.topic_label.item(),
            topic_row.topic_words.item(),
        ))

        # Highest-weighted documents first.
        top_docs = dt.sort_values(topic, ascending=False).head(20)['filename'].tolist()
        f.writelines("## {}\n".format(doc) for doc in top_docs)

        # Horizontal rule closing this topic's section.
        f.write("---")
In [ ]: