• 欢迎光临~

# elasticsearch算法之词项相似度算法(二)

\[ld_{u,v}(i,j)=\max(i,j), \quad \text{当 } \min(i,j) = 0\]

当u和v的长度都不为0的时候，存在三种转化的可能：

u的不包含最后一个字符的前缀转化为v的前提下，这是删除字符的情形；

\[ld_{u,v}(i,j) = ld_{u,v}(i-1,j)+1\]

u已经整个转化为v不包含最后一个字符的前缀的前提下，这是添加字符的情形；

\[ld_{u,v}(i,j) = ld_{u,v}(i,j-1)+1\]

u前缀已经转化为v的前缀的前提下，这个时候需要根据两个词的最后一个字符是否相等，如果不等的话就需要替换；

\[ld_{u,v}(i,j) = ld_{u,v}(i-1,j-1)+C_{u_{i}\ne v_{j}}, \quad C_{u_{i}\ne v_{j}} = \begin{cases} 1, & u_{i}\ne v_{j} \\ 0, & u_{i}=v_{j} \end{cases}\]

v => v,编辑距离为0；

vl => v,编辑距离为1；

\[ld_{vlna,vlan}(0,1) = ld_{v,vl} = 1\]

\[ld_{vlna,vlan}(1,1) = ld_{vl,vl} = ld_{v,vl} + 1 = 2\]

\[ld_{vlna,vlan}(1,0) = ld_{vl,v} = 1\]

\[ld_{vlna,vlan}(1,1) = ld_{vl,vl} = ld_{vl,v} + 1 = 2\]

\[ld_{vlna,vlan}(0,0) = ld_{v,v} = 0\]

\[ld_{vlna,vlan}(1,1) = ld_{vl,vl} = ld_{v,v} + 0 = 0\]

``````import copy
import numpy as np
import pandas as pd


def levenshtein_edit_distance(u, v):
    """Compute the Levenshtein edit distance between words u and v.

    Both words are lowercased before comparison.

    Returns:
        (distance, edit_matrix): the edit distance, and the dynamic
        programming table as a pandas DataFrame whose row index is the
        characters of v and whose columns are the characters of u
        (an empty DataFrame when either word is empty).
    """
    u = u.lower()
    v = v.lower()

    # Degenerate cases: distance to/from the empty word is the other
    # word's length.  (The original fell through to `return` with
    # edit_matrix never assigned -- UnboundLocalError.)
    if len(u) == 0:
        return len(v), pd.DataFrame()
    if len(v) == 0:
        return len(u), pd.DataFrame()

    edit_matrix = []
    # Padding row for the empty prefix of u: inserting the first j
    # characters of v costs j.  (Bug fix: the original filled this
    # list of length len(v)+1 over range(len(u)+1), which raises
    # IndexError whenever len(u) > len(v).)
    pre_row = list(range(len(v) + 1))
    current_row = [0] * (len(v) + 1)

    for i in range(len(u)):
        current_row[0] = i + 1  # deleting the i+1 leading chars of u
        for j in range(len(v)):
            cost = 0 if u[i] == v[j] else 1
            # Minimum over insertion, deletion, substitution/match.
            current_row[j + 1] = min(current_row[j] + 1,
                                     pre_row[j + 1] + 1,
                                     pre_row[j] + cost)

        pre_row = current_row[:]
        edit_matrix.append(current_row[:])

    distance = current_row[len(v)]
    # Transpose so v runs down the rows, then drop the padding row
    # that corresponds to the empty prefix of u.
    table = np.array(edit_matrix).T[1:]
    edit_matrix = pd.DataFrame(data=table, index=list(v), columns=list(u))

    return distance, edit_matrix
``````

``````vlan = 'vlan'
vlna = 'vlna'
http = 'http'
words = [vlan, vlna, http]

# Compare 'vlna' against every candidate word and show the full
# dynamic-programming table for each comparison.
input_word = 'vlna'
for candidate in words:
    distance, martrix = levenshtein_edit_distance(input_word, candidate)
    print(f"{input_word} and {candidate} levenshtein edit distance is  {distance}")
    print('the complate edit distance matrix')
    print(martrix)

vlna and vlan levenshtein edit distance is  2
the complate edit distance matrix
v  l  n  a
v  0  1  2  3
l  1  0  1  2
a  2  1  1  1
n  3  2  1  2
vlna and vlna levenshtein edit distance is  0
the complate edit distance matrix
v  l  n  a
v  0  1  2  3
l  1  0  1  2
n  2  1  0  1
a  3  2  1  0
vlna and http levenshtein edit distance is  4
the complate edit distance matrix
v  l  n  a
h  1  2  3  4
t  2  2  3  4
t  3  3  3  4
p  4  4  4  4

``````

\[cs(u,v) = \cos\theta = \frac{u\cdot v}{|u|\,|v|} = \frac{\sum_{i=1}^{n} u_{i}v_{i}}{\sqrt{\sum_{i=1}^{n} u_{i}^{2}}\,\sqrt{\sum_{i=1}^{n} v_{i}^{2}}}\]

\[cd(u,v) = 1 - cs(u,v) = 1 - \cos\theta = 1 - \frac{u\cdot v}{|u|\,|v|} = 1 - \frac{\sum_{i=1}^{n} u_{i}v_{i}}{\sqrt{\sum_{i=1}^{n} u_{i}^{2}}\,\sqrt{\sum_{i=1}^{n} v_{i}^{2}}}\]

``````from scipy.stats import itemfreq

def boc_term_vectors(words):
    """Build bag-of-characters vectors for a list of words.

    Returns:
        (chars, boc_vectors): the sorted unique characters appearing
        across all words, and one numpy count vector per word, aligned
        with ``chars``.
    """
    # scipy.stats.itemfreq was removed in scipy >= 1.2; Counter gives
    # the same per-character frequencies with integer counts.
    from collections import Counter

    words = [word.lower() for word in words]
    # np.unique returns the characters sorted, e.g. ['a', 'h', 'l', ...].
    unique_chars = np.unique(np.hstack([list(word) for word in words]))
    word_term_counts = [Counter(word) for word in words]

    boc_vectors = [
        np.array([word_term_count.get(char, 0) for char in unique_chars])
        for word_term_count in word_term_counts
    ]

    return list(unique_chars), boc_vectors
``````

``````vlan = 'vlan'
vlna = 'vlna'
http = 'http'
words = [vlan, vlna, http]

# Show the character alphabet and each word's bag-of-characters vector.
chars, vectors = boc_term_vectors(words)
boc_vlan, boc_vlna, boc_http = vectors
print(f'all chars {chars}')
print(f"vlan {boc_vlan}")
print(f"vlna {boc_vlna}")
print(f"http {boc_http}")

all chars ['a', 'h', 'l', 'n', 'p', 't', 'v']
vlan [1 0 1 1 0 0 1]
vlna [1 0 1 1 0 0 1]
http [0 1 0 0 1 2 0]
``````

``````def cosin_distance(u, v):
distance = 1.0 - np.dot(u,v)/(np.sqrt(sum(np.square(u))) * np.sqrt(sum(np.square(v))))
return distance
``````

``````vlan = 'vlan'
vlna = 'vlna'
http = 'http'
words = [vlan, vlna, http]

# Vectorize every word, then measure each one against 'vlna'.
chars, boc_words = boc_term_vectors(words)
input_word = vlna
boc_input = boc_words[words.index(input_word)]
for word, boc_word in zip(words, boc_words):
    print(f'{input_word} and {word} cosine distance id {cosin_distance(boc_input, boc_word)}')

vlna and vlan cosine distance id 0.0
vlna and vlna cosine distance id 0.0
vlna and http cosine distance id 1.0
``````