DBSCAN

    技术2024-07-10  74

    1、KD树的构造(ongoing)

    #DBSCAN inspects abnormal sample import numpy as np from heapq import heappush, heappop, nsmallest, heappushpop from scipy.spatial import KDTree  import matplotlib.pyplot as plt

    decisionNode = dict(boxstyle="sawtooth", fc="0.8") leafNode = dict(boxstyle="round4", fc="0.8") arrow_args = dict(arrowstyle="<-")

    def getNumLeafs(mytree):     numLeafs = 0     for key in (mytree.less, mytree.greater):         if type(key).__name__=='innerNode':#test to see if the nodes are dictonaires, if not they are leaf nodes             numLeafs += getNumLeafs(key)         else:   numLeafs +=1              return numLeafs

    def getTreeDepth(mytree):     maxDepth = 0     for key in (mytree.less, mytree.greater):         if type(key).__name__=='innerNode':#test to see if the nodes are dictonaires, if not they are leaf nodes             thisDepth = 1 + getTreeDepth(key)         else:   thisDepth = 1         if thisDepth > maxDepth: maxDepth = thisDepth     return maxDepth

    def plotNode(nodeTxt, centerPt, parentPt, nodeType):     createPlot.ax1.annotate(nodeTxt, xy=parentPt,  xycoords='axes fraction',              xytext=centerPt, textcoords='axes fraction',              va="center", ha="center", bbox=nodeType, arrowprops=arrow_args )      def plotMidText(cntrPt, parentPt, txtString):     xMid = (parentPt[0]-cntrPt[0])/2.0 + cntrPt[0]     yMid = (parentPt[1]-cntrPt[1])/2.0 + cntrPt[1]     createPlot.ax1.text(xMid, yMid, txtString, va="center", ha="center", rotation=30)

    def plotTree(myTree, parentPt, nodeTxt):#if the first key tells you what feat was split on     numLeafs = getNumLeafs(myTree)  #this determines the x width of this tree     depth = getTreeDepth(myTree)     #firstStr = list(myTree.keys())[0]     #the text label for this node should be this     cntrPt = (plotTree.xOff + (1.0 + float(numLeafs))/2.0/plotTree.totalW, plotTree.yOff)     #plotMidText(cntrPt, parentPt, nodeTxt)     plotNode(myTree.pivot_idx, cntrPt, parentPt, decisionNode)     #secondDict = myTree[firstStr]     plotTree.yOff = plotTree.yOff - 1.0/plotTree.totalD     for key in (myTree.less, myTree.greater):         if type(key).__name__=='innerNode':#test to see if the nodes are dictonaires, if not they are leaf nodes                plotTree(key, cntrPt, key.split_dim)        #recursion         else:   #it's a leaf node print the leaf node                          plotTree.xOff = plotTree.xOff + 1.0/plotTree.totalW             if key:                 plotNode(key.pivot_idx, (plotTree.xOff, plotTree.yOff), cntrPt, leafNode)             #plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))     plotTree.yOff = plotTree.yOff + 1.0/plotTree.totalD #if you do get a dictonary you know it's a tree, and the first element will be another dict

    def createPlot(inTree):     fig = plt.figure(1, facecolor='white')     fig.clf()     axprops = dict(xticks=[], yticks=[])     createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)    #no ticks     #createPlot.ax1 = plt.subplot(111, frameon=False) #ticks for demo puropses      plotTree.totalW = float(getNumLeafs(inTree))     plotTree.totalD = float(getTreeDepth(inTree))     plotTree.xOff = -0.5/plotTree.totalW; plotTree.yOff = 1.0;     plotTree(inTree, (0.5,1.0), '')     plt.show()

    def minkowski_distance_p(x, y, p):     d = x - y     return np.power(np.sum(d**p), 1/p)      class KdTree:     def __init__(self, data):         self.data = np.asarray(data)         self.m, self.n = self.data.shape         self.root = self.__build(np.arange(self.m), 0, None)         self.current_node = None              class innerNode:         def __init__(self, split_dim, pivot, pivot_idx, height, less, greater, parent):             self.split_dim = split_dim             self.pivot = pivot             self.pivot_idx = pivot_idx             self.height = height             self.less = less             self.greater = greater             self.parent = parent             self.visit = 0          class leafNode(innerNode):         def __init__(self, split_dim, pivot, pivot_idx, height, less, greater, parent):             super().__init__(split_dim, pivot, pivot_idx, height, less, greater, parent)

        def __build(self, idx, height, parent):         split_dim = height % self.n         #print(self.data[idx])         if len(idx) == 0:             return None         if len(idx) == 1:             return KdTree.leafNode(split_dim, self.data[idx[0]][split_dim], idx[0], height, None, None, parent)                  data = self.data[idx]         data = data[:, split_dim]                  if len(data)%2 == 0:             pivot = np.median(np.append(data, data[0]))         else:             pivot = np.median(data)         pivot_idx = idx[np.argwhere(data==pivot)[0][0]]         less_idx = np.nonzero(data<pivot)[0]         greater_idx = np.nonzero(data>pivot)[0]         #note:下面函数应该传入idx[less_idx] idx[greater_idx],而不是less_idx greater_idx         p = KdTree.innerNode(split_dim, pivot, pivot_idx, height, None, None, parent)         p.less = self.__build(idx[less_idx], height + 1, p)         p.greater = self.__build(idx[greater_idx], height + 1, p)         return p              """         # 递归awesome         return KDTree.KDNode(split_dim, pivot, pivot_idx, height,                                self.__build(idx[less_idx], height + 1, self.KDNode),                               self.__build(idx[greater_idx], height + 1, self.KDNode), parent)         """              def __findLeaf(self, x, root):         node = root         while node:             leaf = node             if node.pivot < x[node.split_dim]:                 node = node.less             else:                 node = node.greater                  return leaf          def __push(self, neighbors, x, k, node, p):         d = minkowski_distance_p(x[np.newaxis,:], self.data[node.pivot_idx], p)         if len(neighbors) < k:             heappush(neighbors, (-d, node.pivot_idx))         else:             heappushpop(neighbors, (-d, node.pivot_idx))

        def __query(self, x , neighbors, innernode, label, k, p):         innernode.visit += 1         if label[innernode.pivot_idx]:             return         while innernode:             if not label[innernode.pivot_idx]:                 label[innernode.pivot_idx] = 1                 self.__push(neighbors, x, k, innernode, p)                 largest_in_neighbors = -nsmallest(1, neighbors)[0][0]                 dis_far_split_axis = np.abs(x[innernode.split_dim]-innernode.pivot)                 if dis_far_split_axis < largest_in_neighbors or len(neighbors)<k:                     if innernode.less and not label[innernode.less.pivot_idx]:                         leaf = self.__findLeaf(x, innernode.less)                         self.__query(x, neighbors, leaf, label, k, p)                     if innernode.greater and not label[innernode.greater.pivot_idx]:                         leaf = self.__findLeaf(x, innernode.greater)                         self.__query(x, neighbors, leaf, label, k, p)                          innernode = innernode.parent              return neighbors          def query(self, x, k=1, p=2):         neighbors = []         node = self.root         label = np.zeros(len(self.data), dtype=np.int16)         leaf = self.__findLeaf(x, node)         label[leaf.pivot_idx] = 1         self.__push(neighbors, x, k, leaf, p)         self.__query(x , neighbors, leaf.parent, label, k, p)         return neighbors              def inorder(self, root):         if root is None:             return         self.inorder(root.less)         print(root.height)         self.inorder(root.greater)

    class DBScan:     def __init__(self, epsilon, minPts):         self.epsilon = epsilon         self.minPts = minPts          if __name__ == "__main__":     data = np.random.randn(700).reshape((100, -1))     #data = np.array([[ 0.74728798, 0.81022863, -0.19179337, 0.878292  ],     #                 [-2.13781247, 0.91024753, 0.09538944, -0.29745797],     #                 [ 0.45066661, -0.27623008, 0.15601932, -1.97192213],     #                 [ 0.79890978, 2.01713301, -0.00664947, -0.37733724],     #                 [-0.75239458, 0.56911767, 1.31537443, -0.6950948 ]])     #print(data)     #print(data)     x = np.random.randn(7)     print(x)     kd = KdTree(data)     kd.inorder(kd.root)     #kd1 = KDTree(data)     #print("current:", getNumLeafs(kd.root))     createPlot(kd.root)

    Processed: 0.015, SQL: 9