Measure Variation in Topic Keys
A final check to confirm that the chosen topic model is robust: compare its topic key words against alternative model runs using Levenshtein distance.
In [2]:
%load_ext autoreload
%autoreload 1
In [3]:
import Levenshtein
from matplotlib import pyplot as plt
import os
import pandas as pd
import seaborn as sns
import sklearn.metrics
In [4]:
%matplotlib inline
dataDir = "../../data/"
In [92]:
def get_smallest_distance(target_word, listofWords):
    """Return the smallest Levenshtein distance from `target_word` to any word in `listofWords`.

    Raises ValueError when `listofWords` is empty (min of an empty sequence),
    matching the original behavior.
    """
    # min over a generator: no need to materialize the full list of distances
    return min(Levenshtein.distance(target_word, word) for word in listofWords)
def compare_topics(topicA, topicB):
    """Mean, over the words of `topicA`, of the smallest Levenshtein distance to any word of `topicB`.

    The original divided by a hard-coded 20 (presumably the expected number of
    key words per topic — TODO confirm against the topicKeys format); dividing
    by len(topicA) gives the same value for 20-word topics and stays correct
    for any other topic size.
    """
    if len(topicA) == 0:
        return 0.0  # same as the original sum([]) / 20
    lowest_values = [get_smallest_distance(word, topicB) for word in topicA]
    return sum(lowest_values) / len(topicA)
def compare_models(modelA, modelB):
    """All-pairs topic distances between two models.

    Parameters are sequences/Series of word lists (one entry per topic).
    The original hard-coded 250 topics per model; iterating over the actual
    lengths works for any model size. Returns a long-format DataFrame with
    columns 'level_0' (A topic id, "A_i"), 'level_1' (B topic id, "B_j") and
    0 (the distance).
    """
    comparisons = {}
    for i in range(len(modelA)):
        # [:-1] drops the last element — presumably a trailing empty string
        # from splitting a line that ends in a space; TODO confirm.
        topicA = modelA[i][:-1]
        values = {}
        for j in range(len(modelB)):
            topicB = modelB[j][:-1]
            values["B_{}".format(j)] = compare_topics(topicA, topicB)
        comparisons["A_{}".format(i)] = values
    return pd.DataFrame.from_dict(comparisons).unstack().reset_index()
def isolate_best_pairs(df):
    """For each target topic ('level_0'), keep the single closest comparison topic.

    `df` is the long-format output of compare_models: columns 'level_0'
    (A topic id), 'level_1' (B topic id) and 0 (the distance).
    """
    # nsmallest(1, columns=0) picks the lowest-distance row within each A-topic
    # group; the groupby.apply result is then sorted worst-to-best match, and
    # drop_duplicates(keep='last') retains one row per A topic.
    return (df.groupby(['level_0'])
        .apply(lambda x: x.nsmallest(1, columns=0, keep='first'))
        .sort_values(by=0, ascending=False)
        .drop_duplicates('level_0', keep='last'))
def print_topic_pairs(modelA, modelB, topicA, topicB):
    """Print the key words of topic `topicA` in `modelA` alongside its best match `topicB` in `modelB`.

    Column 2 of the topicKeys frame holds the space-separated key words.
    """
    target_words = modelA.loc[topicA, 2]
    match_words = modelB.loc[topicB, 2]
    print("Target: {} \nBest Match: {}".format(target_words, match_words))
def compute_percentage(df, threshold):
    """Return the fraction of rows in `df` whose distance (column 0) is <= `threshold`."""
    # Count via the boolean mask directly instead of len() of a filtered frame.
    n_within = int((df[0] <= threshold).sum())
    return n_within / len(df)
def process_topicKey_pairs(dfA, dfB):
    """Split each model's key-word strings, compare every topic pair, and return the best match per A topic."""
    words_a = dfA[2].str.split(' ')
    words_b = dfB[2].str.split(' ')
    all_pairs = compare_models(words_a, words_b)
    return isolate_best_pairs(all_pairs)
Target Model¶
In [79]:
# Load the reference ("target") model's topic-key file (tab-separated, no header row).
target = pd.read_table(os.path.join(dataDir, 'target_300_10.18497.topicKeys.txt'), header=None)
In [81]:
# Preview the first 10 topics of the target model.
target[:10]
Out[81]:
Random¶
In [80]:
# Load the "random" comparison run's topic keys.
# NOTE(review): the name `random` shadows the stdlib `random` module — consider renaming.
random = pd.read_table(os.path.join(dataDir, 'random.16195.topicKeys.txt'), header=None)
In [83]:
# Best-matching random-model topic for every target topic.
target_random = process_topicKey_pairs(target, random)
In [93]:
# Spot-check one target/random topic pairing by eye.
print_topic_pairs(target, random, 211, 214)
In [94]:
# Another manual spot-check of a matched pair.
print_topic_pairs(target, random, 163, 56)
In [95]:
# Third spot-check; note topic B_214 appears as best match more than once.
print_topic_pairs(target, random, 57, 214)
In [96]:
# Share of target topics whose best match is within distance 2.3
# (threshold appears to be chosen empirically — TODO document how it was picked).
compute_percentage(target_random, 2.3)
Out[96]:
Test 300 25¶
In [97]:
# Load topic keys for the "300_25" test run.
test30025 = pd.read_table(os.path.join(dataDir, 'test_300_25.9075.topicKeys.txt'), header=None)
In [98]:
# Best-matching 300_25 topic for every target topic.
target_test30025 = process_topicKey_pairs(target, test30025)
In [105]:
# Preview the 15 worst-scoring matched pairs (frame is sorted by distance descending).
target_test30025[:15]
Out[105]:
In [99]:
# Spot-check a target/300_25 topic pairing by eye.
print_topic_pairs(target, test30025, 136, 152)
In [100]:
# Second manual spot-check for the 300_25 run.
print_topic_pairs(target, test30025, 85, 47)
In [101]:
# Same 2.3 threshold as the random baseline, for comparability.
compute_percentage(target_test30025, 2.3)
Out[101]:
Test 300 no_max¶
In [103]:
# Load topic keys for the "300_noMax" test run.
test_300noMax = pd.read_table(os.path.join(dataDir, 'test_300_noMax.18040.topicKeys.txt'), header=None)
In [104]:
# Best-matching 300_noMax topic for every target topic.
target_test300noMax = process_topicKey_pairs(target, test_300noMax)
In [107]:
# Preview the 15 worst-scoring matched pairs for the 300_noMax run.
target_test300noMax[:15]
Out[107]:
In [75]:
# Fixed undefined names: this notebook defines `target` and `test_300noMax`,
# not `target_topics` / `test_300noMax_topics` (stale names from an earlier
# kernel session — see the out-of-sequence execution count).
print_topic_pairs(target, test_300noMax, 54, 125)
In [76]:
# Fixed undefined names (`target_topics` / `test_300noMax_topics` are never
# defined in this notebook); use the DataFrames loaded above, matching the
# pattern of the other spot-check cells.
print_topic_pairs(target, test_300noMax, 183, 228)
In [108]:
# Same 2.3 threshold as the other runs, for comparability.
compute_percentage(target_test300noMax, 2.3)
Out[108]:
In [ ]: