• 微信公众号：美女很有趣。 工作之余，放松一下，关注即送10G+美女照片！

# vitrebi算法进行分词的案例

1个月前 (04-23) 11次浏览

# 1.viterbi算法的原理

（1）初始化

[sigma_{1}(i) = pi_{i}b_{i}(o_{1})
]

[phi_{1}(i) = 0
]

(2) 递推 对t = 2,3,…,T

[sigma_{t}(i) = max(sigma_{t-1}(j)a_{ji}) ~~1<=j<=N~~~~~ i = 1,2,…N
]

[phi_{t}(i) = argmax[sigma_{t-1}(j)a_{ji}] ~ 1<=j<=N
]

（3）终止

[p^* = max(sigma_{t}(i) ~ 1<=i<=N)
]

[i_{T}^* = argmax[sigma_{T}(i)] 1<=i <=N
]

(4) 最优路径回溯

[i_{t}^* = phi_{t+1}(t+1)
]

# 2.以一个小例子的python实现来说明上述过程

``````import numpy as np

def viterbi(A,B,pi,o):
"""

:param A: 表示状态转移矩阵
:param B:  表示观测概率矩阵
:param pi: 初始概率
:param o: 观测序列
:return:
"""
N = len(A[0]) # 状态的个数，即有多少个状态
T = len(o) # 观测的时间段有多长

delta = [[0]*N for _ in range(T)] # 用来存储每一时间下，各隐藏状态的概率
psi = [[0]*N for _ in range(T)] # 用来存储每一时间下，取最大值的的那个状态

# init
for i in range(N):
delta[0][i] = pi[i]*B[i][o[0]]  # 初始条件下（t=0）的各个状态的概率就等于初始概率乘以观测概率矩阵的第一列下各概率的值
psi[0][i] = 0  # 初始的最大值状态全部设为0

# iter

for t in range(1,T):
for i in range(N):
temp,max_index = 0,0
for j in range(N):
res = delta[t-1][j]*A[j][i]
if res > temp:
temp = res
max_index = j  # 保存取最大值时的索引
delta[t][i] = temp*B[i][o[t]]
psi[t][i] = max_index # 每个i循环时都有一个max_index

# end
p = max(delta[-1])
for i in range(N):
if delta[-1][i] == p:
i_T = i

# backtrack
path = [0]*T
i_t = i_T
for t in reversed(range(T-1)):
i_t = psi[t+1][i_t]
path[t] = i_t

path[-1] = i_T

return delta,psi,path

A = [[0.5,0.2,0.3],[0.3,0.5,0.2],[0.2,0.3,0.5]]
B = [[0.5,0.5],[0.4,0.6],[0.7,0.3]]
pi = [0.2,0.4,0.4]
o = [0,1,0]
delta,psi,path = viterbi(A,B,pi,o)
print(np.array(delta))
print(psi)
print(path) # 这里的结果2是索引，表示的就是3
``````

``````[[0.1     0.16    0.28   ]
[0.028   0.0504  0.042  ]
[0.00756 0.01008 0.0147 ]]
[[0, 0, 0], [2, 2, 2], [1, 1, 2]]
[2, 2, 2]
``````

# 3 以矩阵的形式来表示上述算法的实现过程

``````# np.multipy表示矩阵的点乘

def viterbi(A,B,pi,o):
A = np.array(A)
B = np.array(B)
pi = np.array(pi).reshape(1,len(pi))
o = np.array(o).reshape(1,len(o))

T = o.shape[1]
N = A.shape[0]

delta = np.zeros((T,N))
psi = np.zeros((T,N))

# 初始
delta[0,:] = np.multiply(pi,B[:,0])
# 迭代
for t in range(1,T):
for i in range(N):
temp = list(np.multiply(delta[t-1,:],A[:,i].T))
delta[t,i] = max(temp) * B[i,o[0,t]]
psi[t,i] = temp.index(max(temp))
# 结束
end = list(delta[-1,:]).index(max(delta[-1,:]))

# 回溯
i_t =end
path = [0]*T
for t in reversed(range(T-1)):
i_t = int(psi[t+1,i_t])
path[t] = i_t
path[-1] = end
return delta,psi,path

A = [[0.5,0.2,0.3],[0.3,0.5,0.2],[0.2,0.3,0.5]]
B = [[0.5,0.5],[0.4,0.6],[0.7,0.3]]
pi = [0.2,0.4,0.4]
o = [0,1,0]
delta,psi,path = viterbi(A,B,pi,o)
print(np.array(delta))
print(psi)
print(path) # 这里的结果2是索引，表示的就是3
``````

# 4 利用维特比算法来解码，这里参考了jieba分词中自带的转移状态矩阵和观测矩阵（发射矩阵）

``````"""
start：初始概率分布，就是第一个字的状态的概率
tran: 状态转移概率，从当前状态到下一个状态的转移的概率
emit: 发射概率，表示在某一状态下生成某个观测状态的概率
"""
import sys
import re
import getopt
sys.path.append('.\finalseg') # 将同级目录finalseg加入到路径中

MIN_FLOAT = -3.14e100

prob_start_p = "prob_start.p"
prob_trans_p = "prob_trans.p"
prob_emit_p = "prob_emit.p"

# 某一个词的状态为key时，prevstatus表示前一个词的状态的框定范围
PrevStatus = {
'B':'ES',
'M':'MB',
'S':'SE',
'E':'BM'
}
Force_Split_Words = set([])
from prob_start import P as start_p
from prob_trans import P as trans_p
from prob_emit import P as emit_p

def viterbi(obs,states,start_p,trans_p,emit_p):
V = [{}] # 这里的v就相当于原始的delta
path = {}
for y in states:   # init 获取这一句子的初始状态分布,可以理解为第一个字为上述4种状态的概率
V[0][y] = start_p[y] + emit_p[y].get(obs[0],MIN_FLOAT) # 概率相乘转变为对数相加
path[y] = [y]
# 对之后每一个字做状态转移概率的分析
for t in range(1,len(obs)):
V.append({})
newpath = {}
# 考察当前字对上一字的发射概率，取其中最大的那个
for y in states:
# 获取当前词在y状态下的发射概率
em_p = emit_p[y].get(obs[t],MIN_FLOAT) #这个就相当于B[i][o[t]]
# state表示前一个字到当前字的y状态的最大概率，prob表示这个概率
(prob,state) = max(
[
(V[t-1][y0] + trans_p[y0].get(y,MIN_FLOAT) + em_p ,y0)
for y0 in PrevStatus[y]
]
)

# 得到当前字的所有可能观测状态的最大概率值
V[t][y] = prob

# 更新路径，state表示当前字的前一个字到当前字的y状态的最大可能概率，所以是path[state]
newpath[y] = path[state] + [y]
path = newpath

# 最后一个要重新复盘，因为最后一个字智能是E or S
(prob,state) = max((V[len(obs) - 1][y],y) for y in 'ES')

for i in path:
print((i,path[i]))

for v_i in V:
print((V))

return (prob,path[state])

def _cut(sentence):  # 这个应该是解码
prob,pos_list = viterbi(sentence,'BMES',start_p,trans_p,emit_p)
begin,nexti = 0,0
for i,char in enumerate(sentence):
pos = pos_list[i]
if pos == 'B':
begin = i
elif pos == 'E':
yield sentence[begin:i+1]
nexti = i+1

elif pos == 'S':
yield char
nexti = i+1

if nexti < len(sentence):
yield sentence[nexti:]

re_han = re.compile("([u4E00-u9FD5]+)") # 匹配所有中文
re_skip = re.compile("([a-zA-Z0-9]+(?:.d+)?%?)")

def cut(sentence):
sentence = sentence.strip()
blocks = re_han.split(sentence)
lseg = []
for blk in blocks:
if re_han.match(blk):
for word in _cut(blk):
if word not in Force_Split_Words:
lseg.append(word)
else:
for c in word:
lseg.append(c)
else:
tmp = re_skip.split(blk)
for x in tmp:
if x:
lseg.append(x)

return lseg

if __name__=='__main__':
sen = "航天2院706所的全称是北京计算机技术及应用研究所"
rs = cut(sen)
print(" ".join(rs))
``````

``````航天 2 院 706 所 的 全称 是 北京 计算机 技术 及 应用 研究 所
``````