Measure Variation in Topic Keys

Last check to confirm that the model is good.

In [2]:
%load_ext autoreload
%autoreload 1
In [3]:
import Levenshtein
from matplotlib import pyplot as plt
import os
import pandas as pd
import seaborn as sns
import sklearn.metrics
In [4]:
%matplotlib inline 
dataDir = "../../data/"
In [92]:
def get_smallest_distance(target_word, listofWords):
    values = []
    for word in listofWords:
        values.append(Levenshtein.distance(target_word, word))
    return min(values)
    
def compare_topics(topicA, topicB):
    lowest_values = []
    for target_word in topicA:
        lowest_values.append(get_smallest_distance(target_word, topicB))
    return sum(lowest_values) / 20

def compare_models(modelA, modelB):
    comparisons = {}
    for i in range(250):
        topicA = modelA[i][:-1]
#         print(topicA)
        values = {}
        for j in range(250):
            topicB = modelB[j][:-1]
            distance = compare_topics(topicA, topicB)
            values["B_{}".format(j)] = distance
        comparisons["A_{}".format(i)] = values
    return pd.DataFrame.from_dict(comparisons).unstack().reset_index()


def isolate_best_pairs(df):
    return (df.groupby(['level_0'])
            .apply(lambda x: x.nsmallest(1, columns=0, keep='first'))
            .sort_values(by=0, ascending=False)
            .drop_duplicates('level_0', keep='last'))


def print_topic_pairs(modelA, modelB, topicA, topicB):
    print("Target: {} \nBest Match: {}".format(modelA.loc[topicA][2], modelB.loc[topicB][2]))
    

def compute_percentage(df, threshold):
    return len(df[df[0] <= threshold])/len(df)


def process_topicKey_pairs(dfA, dfB):
    topicsA = dfA[2].str.split(' ')
    topicsB = dfB[2].str.split(' ')
    
    comparison = compare_models(topicsA, topicsB)
    
    return isolate_best_pairs(comparison)
    

Target Model

In [79]:
target = pd.read_table(os.path.join(dataDir, 'target_300_10.18497.topicKeys.txt'), header=None)
In [81]:
target[:10]
Out[81]:
0 1 2
0 0 0.02981 book canvasser order canvassing brother week s...
1 1 0.00927 cup water bread egg add cream salt flour milk ...
2 2 0.03660 ing tion ment sign ness com ter tions great en...
3 3 0.03036 death man blood men like life hand victim poor...
4 4 0.02033 shall lord unto faith thing hope trial christ ...
5 5 0.10352 thing know think way man want let doe right go...
6 6 0.01276 king daniel babylon jerusalem lord kingdom neb...
7 7 0.04291 ing lie review end herald ill tie ile tho good...
8 8 0.01902 prayer lord heart spirit blessing let church m...
9 9 0.01000 meeting church brother lord sabbath truth held...

Random

In [80]:
random = pd.read_table(os.path.join(dataDir, 'random.16195.topicKeys.txt'), header=None)
In [83]:
target_random = process_topicKey_pairs(target, random)
In [93]:
print_topic_pairs(target, random, 211, 214)
Target: church luther reformation persecution reformer faith protestant history century great pope death doctrine inquisition men john martyr rome truth heretic  
Best Match: england slave slavery persecution colony puritan church new death john history english quaker free prison baptist person massachusetts master old 
In [94]:
print_topic_pairs(target, random, 163, 56)
Target: war peace nation world men great shall earth europe battle let sword armageddon army prophecy preparation spirit strife conflict international  
Best Match: war peace nation world europe great germany power france england russia men army state military navy international united government european 
In [95]:
print_topic_pairs(target, random, 57, 214)
Target: england london english king great british queen country new colony scotland america ireland britain house john old parliament william sir  
Best Match: england slave slavery persecution colony puritan church new death john history english quaker free prison baptist person massachusetts master old 
In [96]:
compute_percentage(target_random, 2.3)
Out[96]:
0.96

Test 300 25

In [97]:
test30025 = pd.read_table(os.path.join(dataDir, 'test_300_25.9075.topicKeys.txt'), header=None)
In [98]:
target_test30025 = process_topicKey_pairs(target, test30025)
In [105]:
target_test30025[:15]
Out[105]:
level_0 level_1 0
level_0
A_85 58692 A_85 B_47 2.70
A_48 48415 A_48 B_247 2.65
A_124 7386 A_124 B_220 2.65
A_227 35968 A_227 B_70 2.50
A_221 34467 A_221 B_7 2.50
A_106 2417 A_106 B_249 2.40
A_29 43033 A_29 B_128 2.40
A_159 16772 A_159 B_118 2.35
A_212 31753 A_212 B_100 2.30
A_89 59594 A_89 B_183 2.30
A_63 52685 A_63 B_40 2.25
A_155 15840 A_155 B_18 2.25
A_146 13324 A_146 B_165 2.25
A_123 7234 A_123 B_85 2.20
A_136 10560 A_136 B_152 2.20
In [99]:
print_topic_pairs(target, test30025, 136, 152)
Target: hour sleep night rest morning clock bed minute evening half long tired life breakfast meal week early mind good worry  
Best Match: exercise sleep health life hour physical rest body mental brain nervous mind strength habit night nerve good man nature condition 
In [100]:
print_topic_pairs(target, test30025, 85, 47)
Target: game amusement play theater pleasure ball dance social playing sport entertainment recreation picture dancing card party place evening young club  
Best Match: city woman men evil crime moral public theater game society new vice york picture social amusement play street thing country 
In [101]:
compute_percentage(target_test30025, 2.3)
Out[101]:
0.968

Test 300 no_max

In [103]:
test_300noMax = pd.read_table(os.path.join(dataDir, 'test_300_noMax.18040.topicKeys.txt'), header=None)
In [104]:
target_test300noMax = process_topicKey_pairs(target, test_300noMax)
In [107]:
target_test300noMax[:15]
Out[107]:
level_0 level_1 0
level_0
A_54 50030 A_54 B_125 2.85
A_32 44098 A_32 B_187 2.85
A_48 48491 A_48 B_91 2.65
A_29 43102 A_29 B_190 2.60
A_56 50643 A_56 B_227 2.55
A_124 7318 A_124 B_16 2.50
A_167 19187 A_167 B_42 2.50
A_218 33310 A_218 B_152 2.45
A_227 35981 A_227 B_82 2.40
A_39 45854 A_39 B_192 2.35
A_134 10123 A_134 B_209 2.35
A_169 19575 A_169 B_166 2.35
A_171 20489 A_171 B_9 2.35
A_93 60894 A_93 B_228 2.35
A_183 23644 A_183 B_228 2.35
In [75]:
print_topic_pairs(target_topics, test_300noMax_topics, 54, 125)
Target: spring berrien mich office michigan address lake_union_conference secretary lake_union herald missionary emmanuel college blosser president conference russell word sent allen_moon  
Best Match: address michigan mich president office chicago illinois lake wisconsin conference spring conference_office miss berrien secretary lake_union_herald sec north church ill 
In [76]:
print_topic_pairs(target_topics, test_300noMax_topics, 183, 228)
Target: disease alcohol life cause case fact result age effect person condition cancer experiment physical mental physician death increase men medical  
Best Match: disease germ health tuberculosis case infection epidemic patient child cause medical death fever physician life hygiene plague typhoid diphtheria water 
In [108]:
compute_percentage(target_test300noMax, 2.3)
Out[108]:
0.94
In [ ]: