• 微信公众号：美女很有趣。 工作之余，放松一下，关注即送10G+美女照片！

# 异常检测—LOF算法简介以及Python实现

3小时前 2次浏览

LOF(Local Outlier Factor)算法是基于密度的异常点检测算法，适合于高维数据检测。

k距离：对于点p，将其他点与之距离进行从小到大排序，第k个即为k距离
k距离邻域：到点p的距离小于等于k距离的所有点（不含p本身）；不存在并列距离时共k个

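可达距离：点p到点o的可达距离为 $\text{reach-dist}_k(p, o) = \max\{d_k(o),\ d(p, o)\}$，其中 $d_k(o)$ 为点o的k距离，$d(p, o)$ 为两点间的实际距离
局部可达密度：点p到其k距离邻域 $N_k(p)$ 内各点的平均可达距离的倒数，即 $\text{lrd}_k(p) = |N_k(p)| \,/ \sum_{o \in N_k(p)} \text{reach-dist}_k(p, o)$
局部离群因子：邻域内点的局部可达密度的均值除以p点的局部可达密度，即 $\text{LOF}_k(p) = \frac{1}{|N_k(p)|} \sum_{o \in N_k(p)} \text{lrd}_k(o) \,/\, \text{lrd}_k(p)$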

局部离群因子(LOF)的大小代表该点为离群点的可信度。即因子越大，该点越可能是离群点。

# 代码示例

from scipy.spatial.distance import cdist
import numpy as np

class LOF:
    """Local Outlier Factor (LOF) detector backed by a dense distance matrix.

    For every sample the detector finds its ``k`` nearest neighbours, derives
    reachability distances and local reachability densities (lrd), and scores
    the sample by the ratio of its neighbours' mean lrd to its own lrd.
    Samples whose score exceeds ``epsilon`` are reported as outliers.

    Note:
        Distance ties are not expanded: exactly ``k`` neighbours are used per
        sample. If a sample has ``k`` or more exact duplicates, its summed
        reachability distance is zero, so its lrd is ``inf`` and the derived
        scores may be ``inf``/``nan`` (NumPy emits a RuntimeWarning).
    """

    def __init__(self, data, k: int, epsilon: float = 1.0):
        """Store the data set and the detector parameters.

        Args:
            data: 2-D array-like of shape (n_samples, n_features).
            k: Number of nearest neighbours; must satisfy 1 <= k < n_samples.
            epsilon: Score threshold above which a sample counts as an outlier.

        Raises:
            ValueError: If ``data`` is not 2-D or ``k`` is out of range.
        """
        data = np.asarray(data)
        if data.ndim != 2:
            raise ValueError(
                "data must be a 2-D array of shape (n_samples, n_features), "
                "got %d dimension(s)" % data.ndim
            )
        n_samples = data.shape[0]
        # Each sample needs k neighbours *other than itself*.
        if not 1 <= k < n_samples:
            raise ValueError(
                "k must satisfy 1 <= k < n_samples (n_samples=%d), got k=%r"
                % (n_samples, k)
            )
        self.data = data
        self.k = k
        self.epsilon = epsilon
        self.N = n_samples

    def get_dist(self):
        """Return the (N, N) pairwise Euclidean distance matrix of the data."""
        return cdist(self.data, self.data)

    def _kdist(self, arr):
        """Return the k nearest neighbours of one sample and its k-distance.

        Args:
            arr: 1-D array of distances from one sample to every sample. The
                entry that sorts first is treated as the sample itself and
                skipped.

        Returns:
            Tuple ``(neighbor_ind, kdist)``: the indices of the k nearest
            neighbours (nearest first) and the distance to the k-th one.
        """
        inds_sort = np.argsort(arr)
        neighbor_ind = inds_sort[1:self.k + 1]  # position 0 is the sample itself
        return neighbor_ind, arr[neighbor_ind[-1]]

    def get_rdist(self):
        """Compute neighbour indices and the reachability-distance matrix.

        Returns:
            Tuple ``(nei_inds, rdist)``. ``nei_inds`` is a tuple holding one
            index array of length ``k`` per sample. ``rdist`` is an (N, N)
            array with ``rdist[o, p] = max(k_distance(o), d(o, p))``, i.e. the
            reachability distance of ``p`` with respect to ``o``.
        """
        dist = self.get_dist()

        # Force every sample to sort first in its own row so that it is the
        # entry skipped by _kdist, even when exact duplicates also sit at
        # distance 0. Neighbour distances are unaffected by this sentinel.
        sort_key = dist.copy()
        np.fill_diagonal(sort_key, -np.inf)

        # A plain loop is used instead of np.apply_along_axis because _kdist
        # returns a ragged (index array, scalar) pair, which NumPy can no
        # longer pack into a single array.
        pairs = [self._kdist(row) for row in sort_key]
        nei_inds = tuple(inds for inds, _ in pairs)
        kdist = np.array([kd for _, kd in pairs])

        # Distances shorter than a row's k-distance are lifted to it.
        rdist = np.maximum(dist, kdist[:, np.newaxis])
        return nei_inds, rdist

    def get_lrd(self, nei_inds, rdist):
        """Compute the local reachability density of every sample.

        Args:
            nei_inds: Sequence with one array of neighbour indices per sample.
            rdist: Reachability-distance matrix as returned by ``get_rdist``.

        Returns:
            1-D array of length N where entry ``i`` equals ``k`` divided by the
            summed reachability distance from sample ``i`` to its neighbours.
        """
        lrd = np.empty(self.N)
        for i, inds in enumerate(nei_inds):
            inds = np.asarray(inds, dtype=np.intp)
            # Column i holds the reachability distance of i w.r.t. each row.
            lrd[i] = self.k / rdist[inds, i].sum()
        return lrd

    def run(self):
        """Compute the local outlier factor of every sample.

        Returns:
            Tuple ``(score, outlier_indices)``: the LOF score of each sample
            and the indices of the samples whose score exceeds ``epsilon``.
        """
        nei_inds, rdist = self.get_rdist()
        lrd = self.get_lrd(nei_inds, rdist)
        score = np.empty(self.N)
        for i, inds in enumerate(nei_inds):
            # Mean lrd of the neighbours relative to the sample's own lrd.
            score[i] = lrd[inds].sum() / self.k / lrd[i]

        return score, np.where(score > self.epsilon)[0]

if __name__ == '__main__':
    # Demo: two tight Gaussian clusters plus uniformly scattered noise points.
    np.random.seed(42)
    blob = 0.3 * np.random.randn(100, 2)
    clusters = np.vstack((blob + 2, blob - 2))
    noise = np.random.uniform(low=-4, high=4, size=(20, 2))
    data = np.vstack((clusters, noise))

    # Flag every sample whose LOF score exceeds 1.2, using 5 neighbours.
    detector = LOF(data, 5, epsilon=1.2)
    score, out_ind = detector.run()
    flagged = data[out_ind]

    import matplotlib.pyplot as plt

    # All samples in blue, detected outliers drawn over them in red.
    plt.scatter(*data.T, color='b')
    plt.scatter(*flagged.T, color='r')
    plt.show()