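Final check to confirm that the target topic model is stable: compare its topics against those of alternative models, using the average Levenshtein distance between the topic words.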

%load_ext autoreload
%autoreload 1

import Levenshtein
from matplotlib import pyplot as plt
import os
import pandas as pd
import seaborn as sns
import sklearn.metrics

%matplotlib inline 
dataDir = "../../data/"

def get_smallest_distance(target_word, listofWords):
    """Return the smallest Levenshtein distance from target_word to any word.

    Each word in listofWords is compared to target_word with
    Levenshtein.distance and the minimum of those distances is returned.
    An empty listofWords makes min() raise ValueError.
    """
    return min(
        Levenshtein.distance(target_word, candidate)
        for candidate in listofWords
    )
    
def compare_topics(topicA, topicB, topic_size=20):
    """Return the average closest-word distance from topicA to topicB.

    For every word in topicA the smallest Levenshtein distance to any word
    in topicB is found; those minima are summed and divided by topic_size.
    The measure is directed: only the words of topicA are iterated, so
    swapping the arguments can give a different value.

    Parameters:
        topicA: iterable of words of the topic being matched.
        topicB: list of words of the candidate topic.
        topic_size: divisor used for the average. Defaults to 20, the value
            that was previously hard-coded; pass a different number when
            the topics do not contain 20 words.

    Returns:
        The summed minimum distances divided by topic_size.
    """
    total_distance = sum(
        get_smallest_distance(target_word, topicB) for target_word in topicA
    )
    return total_distance / topic_size

def compare_models(modelA, modelB, num_topics=250):
    """Compute the topic-to-topic distance for every pair of topics.

    Parameters:
        modelA: indexable by 0..num_topics-1; each entry is a sequence of
            topic words.
        modelB: same layout as modelA.
        num_topics: how many topics (starting at index 0) to read from each
            model. Defaults to 250, the value that was previously
            hard-coded.

    Returns:
        A long-format DataFrame with one row per (A topic, B topic) pair and
        the columns 'level_0' ("A_<i>"), 'level_1' ("B_<j>") and 0 (the
        value returned by compare_topics for that pair).
    """
    # The last element of every topic is dropped before comparing.
    # NOTE(review): presumably this is the empty string left by splitting a
    # line that ends in a space -- confirm against the topicKeys files.
    # The B topics are sliced once here rather than once per A topic.
    topicsB = [modelB[j][:-1] for j in range(num_topics)]

    comparisons = {}
    for i in range(num_topics):
        topicA = modelA[i][:-1]
        comparisons["A_{}".format(i)] = {
            "B_{}".format(j): compare_topics(topicA, topicB)
            for j, topicB in enumerate(topicsB)
        }
    return pd.DataFrame.from_dict(comparisons).unstack().reset_index()


def isolate_best_pairs(df):
    """Keep, for every 'level_0' value, the row with the smallest distance.

    df is expected to have the columns 'level_0' and 0 (the distance).
    The rows are grouped by 'level_0', the single row with the lowest value
    in column 0 is taken from each group (first occurrence on ties), and the
    result is ordered from the largest to the smallest distance.
    """
    def _closest_row(group):
        # One row per group: the lowest value in column 0.
        return group.nsmallest(1, columns=0, keep='first')

    best_per_target = df.groupby(['level_0']).apply(_closest_row)
    worst_first = best_per_target.sort_values(by=0, ascending=False)
    return worst_first.drop_duplicates('level_0', keep='last')


def print_topic_pairs(modelA, modelB, topicA, topicB):
    """Print the column-2 entry of row topicA in modelA and row topicB in modelB.

    The first is labelled "Target" and the second "Best Match".
    """
    target_words = modelA.loc[topicA][2]
    match_words = modelB.loc[topicB][2]
    print("Target: {} \nBest Match: {}".format(target_words, match_words))
    

def compute_percentage(df, threshold):
    """Return the share of rows whose column 0 value is at most threshold.

    Despite the name, the result is a fraction between 0 and 1, not a value
    scaled to 100. An empty df raises ZeroDivisionError.
    """
    within_threshold = df[0] <= threshold
    return len(df[within_threshold]) / len(df)


def process_topicKey_pairs(dfA, dfB):
    """Find, for each topic in dfA, its closest topic in dfB.

    Column 2 of each frame is split on single spaces into word lists, every
    topic pair is scored with compare_models, and the best pair per dfA
    topic is selected with isolate_best_pairs.
    """
    words_a, words_b = (frame[2].str.split(' ') for frame in (dfA, dfB))
    return isolate_best_pairs(compare_models(words_a, words_b))

Target Model

# Load the target model's topic keys. The file is read without a header row,
# so the columns get the integer labels 0, 1 and 2 (column 2 holds the words).
target = pd.read_table(os.path.join(dataDir, 'target_300_10.18497.topicKeys.txt'), header=None)

target[:10]

Random

# Load the topic keys of the "random" model (no header row in the file).
random = pd.read_table(os.path.join(dataDir, 'random.16195.topicKeys.txt'), header=None)

# For every target topic, keep its closest topic in the random model.
target_random = process_topicKey_pairs(target, random)

print_topic_pairs(target, random, 211, 214)

Target: church luther reformation persecution reformer faith protestant history century great pope death doctrine inquisition men john martyr rome truth heretic  
Best Match: england slave slavery persecution colony puritan church new death john history english quaker free prison baptist person massachusetts master old

print_topic_pairs(target, random, 163, 56)

Target: war peace nation world men great shall earth europe battle let sword armageddon army prophecy preparation spirit strife conflict international  
Best Match: war peace nation world europe great germany power france england russia men army state military navy international united government european

print_topic_pairs(target, random, 57, 214)

Target: england london english king great british queen country new colony scotland america ireland britain house john old parliament william sir  
Best Match: england slave slavery persecution colony puritan church new death john history english quaker free prison baptist person massachusetts master old

compute_percentage(target_random, 2.3)

0.96

Test 300 25

# Load the topic keys of the "test_300_25" model (no header row in the file).
test30025 = pd.read_table(os.path.join(dataDir, 'test_300_25.9075.topicKeys.txt'), header=None)

# For every target topic, keep its closest topic in the test_300_25 model.
target_test30025 = process_topicKey_pairs(target, test30025)

target_test30025[:15]

print_topic_pairs(target, test30025, 136, 152)

Target: hour sleep night rest morning clock bed minute evening half long tired life breakfast meal week early mind good worry  
Best Match: exercise sleep health life hour physical rest body mental brain nervous mind strength habit night nerve good man nature condition

print_topic_pairs(target, test30025, 85, 47)

Target: game amusement play theater pleasure ball dance social playing sport entertainment recreation picture dancing card party place evening young club  
Best Match: city woman men evil crime moral public theater game society new vice york picture social amusement play street thing country

compute_percentage(target_test30025, 2.3)

0.968

Test 300 no_max

# Load the topic keys of the "test_300_noMax" model (no header row in the file).
test_300noMax = pd.read_table(os.path.join(dataDir, 'test_300_noMax.18040.topicKeys.txt'), header=None)

# For every target topic, keep its closest topic in the test_300_noMax model.
target_test300noMax = process_topicKey_pairs(target, test_300noMax)

target_test300noMax[:15]

# Pass the loaded DataFrames: print_topic_pairs looks rows up with .loc and
# reads column 2, exactly as in the calls for the other models.
print_topic_pairs(target, test_300noMax, 54, 125)

Target: spring berrien mich office michigan address lake_union_conference secretary lake_union herald missionary emmanuel college blosser president conference russell word sent allen_moon  
Best Match: address michigan mich president office chicago illinois lake wisconsin conference spring conference_office miss berrien secretary lake_union_herald sec north church ill

# Pass the loaded DataFrames: print_topic_pairs looks rows up with .loc and
# reads column 2, exactly as in the calls for the other models.
print_topic_pairs(target, test_300noMax, 183, 228)

Target: disease alcohol life cause case fact result age effect person condition cancer experiment physical mental physician death increase men medical  
Best Match: disease germ health tuberculosis case infection epidemic patient child cause medical death fever physician life hygiene plague typhoid diphtheria water

compute_percentage(target_test300noMax, 2.3)

0.94

	0	1	2
0	0	0.02981	book canvasser order canvassing brother week s...
1	1	0.00927	cup water bread egg add cream salt flour milk ...
2	2	0.03660	ing tion ment sign ness com ter tions great en...
3	3	0.03036	death man blood men like life hand victim poor...
4	4	0.02033	shall lord unto faith thing hope trial christ ...
5	5	0.10352	thing know think way man want let doe right go...
6	6	0.01276	king daniel babylon jerusalem lord kingdom neb...
7	7	0.04291	ing lie review end herald ill tie ile tho good...
8	8	0.01902	prayer lord heart spirit blessing let church m...
9	9	0.01000	meeting church brother lord sabbath truth held...

		level_0	level_1	0
level_0
A_85	58692	A_85	B_47	2.70
A_48	48415	A_48	B_247	2.65
A_124	7386	A_124	B_220	2.65
A_227	35968	A_227	B_70	2.50
A_221	34467	A_221	B_7	2.50
A_106	2417	A_106	B_249	2.40
A_29	43033	A_29	B_128	2.40
A_159	16772	A_159	B_118	2.35
A_212	31753	A_212	B_100	2.30
A_89	59594	A_89	B_183	2.30
A_63	52685	A_63	B_40	2.25
A_155	15840	A_155	B_18	2.25
A_146	13324	A_146	B_165	2.25
A_123	7234	A_123	B_85	2.20
A_136	10560	A_136	B_152	2.20

		level_0	level_1	0
level_0
A_54	50030	A_54	B_125	2.85
A_32	44098	A_32	B_187	2.85
A_48	48491	A_48	B_91	2.65
A_29	43102	A_29	B_190	2.60
A_56	50643	A_56	B_227	2.55
A_124	7318	A_124	B_16	2.50
A_167	19187	A_167	B_42	2.50
A_218	33310	A_218	B_152	2.45
A_227	35981	A_227	B_82	2.40
A_39	45854	A_39	B_192	2.35
A_134	10123	A_134	B_209	2.35
A_169	19575	A_169	B_166	2.35
A_171	20489	A_171	B_9	2.35
A_93	60894	A_93	B_228	2.35
A_183	23644	A_183	B_228	2.35