当前位置：Gxlcms > 数据库问题 > DB Scan算法的分析与实现

DB Scan算法的分析与实现

时间：2021-07-01 10:21:17 帮助过：7人阅读

　　　　mark P as visited //已经访问

　　　　NeighborPts = regionQuery(P, eps) //计算这个点的邻域
　　　　if sizeof(NeighborPts) < MinPts //不能作为核心点
　　　　　　mark P as NOISE //标记为噪音数据
　　　　else //作为核心点，根据该点创建一个类别
　　　　　　C = next cluster
　　　　　　expandCluster(P, NeighborPts, C, eps, MinPts) //根据该核心店扩展类别

expandCluster(P, NeighborPts, C, eps, MinPts)
　　add P to cluster C //扩展类别，核心店先加入
　　for each point P‘ in NeighborPts //然后针对核心店邻域内的点，如果该点没有被访问，
　　　　if P‘ is not visited
　　　　　　mark P‘ as visited //进行访问
　　　　　　NeighborPts‘ = regionQuery(P‘, eps) //如果该点为核心点，则扩充该类别
　　　　　　if sizeof(NeighborPts‘) >= MinPts
　　　　　　　　NeighborPts = NeighborPts joined with NeighborPts‘
　　　　if P‘ is not yet member of any cluster //如果邻域内点不是核心点，并且无类别，比如噪音数据，则加入此类别
　　　　　　add P‘ to cluster C

regionQuery(P, eps) //计算邻域
　　return all points within P‘s eps-neighborhood

这个算法的时间复杂度的限制主要在于邻域的计算，如果存在索引，则能够降低时间复杂度，如果不存在索引，则邻域计算需要遍历所有点，这样时间复杂度就比较高，相当于双层的n的循环，所以时间复杂度为O（n2）；

不过针对相同时间复杂度的时间，每个人的算法的运行时间也不尽相同，如果能够做很多的优化，一些常数时间的减少也能够减少时间，由于我这里仅仅是针对伪代码的实现，没有进行数据结构的优化，相对的数据结构也是利用了k-means用的数据结构进行了一些补充，比如添加一些基于密度需要的属性，比如是否被访问，是否为噪音，是否被添加至邻域中等等，跑起来针对数据集1的规模能够很快的得出结果，但是针对数据集2的数据量30W左右的点就存在很多问题，时间很慢，甚至不能容忍，release优化下好一些，debug下基本上无法等待出结果的，所以任何程序都要进行优化的，优化到你个人的最优；

接下来是这个算法的具体实现：

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 COLORREF colorfultmp[]={RGB(192,192,192),RGB(255,255,0),RGB(227,207,87), RGB(255,153,18),RGB(235,142,85),RGB(255,227,132), RGB(255,97,0),RGB(176,224,230),RGB(65,105,230), RGB(0,255,255),RGB(56,94,15),RGB(8,46,84), RGB(128,42,42),RGB(34,139,34),RGB(160,32,240), RGB(255,0,0),RGB(176,48,96),RGB(176,23,31),RGB(0,0,0)}; void Cluster::dbscan(vector<AbstractDataPT*> pts,double eps,int MinPts) { int categroy=0; int i,length=pts.size(); vector<AbstractDataPT*> neightpts,tmpneightpts; //遍历所有的点 for (i=0;i<length;i++) { //Sleep(1000); //如果没有访问，标记为访问 if (!pts[i]->m_bVisited) { pts[i]->m_bVisited=TRUE; //计算邻域 neightpts=pts[i]->regionQuery(pts,pts[i],eps); //include //噪音标记 if (neightpts.size()<MinPts) { pts[i]->categroy=0;//NOISE pts[i]->m_isNoise=TRUE; pts[i]->m_colorPT=colorfultmp[pts[i]->categroy%18]; } else { //核心点，聚类 categroy++; pts[i]->categroy=categroy; pts[i]->m_colorPT=colorfultmp[pts[i]->categroy%18]; //邻域内的点标记已经被添加 for (int k=0;k<neightpts.size();k++) { neightpts[k]->m_isInclude=TRUE; } //继续扩大此聚集类 int j=1; //因为邻域的规模组建增大，所以利用while while (j<neightpts.size()) { //没有类别则加入该类别，扩大的类别 if (neightpts[j-1]->categroy==0) { neightpts[j-1]->categroy=categroy; neightpts[j-1]->m_colorPT=colorfultmp[neightpts[j-1]->categroy%18]; } //标记访问 if (!neightpts[j-1]->m_bVisited) { neightpts[j-1]->m_bVisited=TRUE; //计算邻域 tmpneightpts=neightpts[j-1]->regionQuery(pts,neightpts[j-1],eps); //合并核心点的邻域 if (tmpneightpts.size()>=MinPts) { //合并 int m,n; //已经添加则不进行添加 for (m=0;m<tmpneightpts.size();m++) { /* if (tmpneightpts[m]->categroy==0) { neightpts.push_back(tmpneightpts[m]); } */ //不重复添加 //感觉这里限制程序时间的瓶颈 if (!tmpneightpts[m]->m_isInclude) { neightpts.push_back(tmpneightpts[m]); tmpneightpts[m]->m_isInclude=TRUE; } /* BOOL exist=FALSE; for (n=0;n<neightpts.size();n++) { if (tmpneightpts[m]==neightpts[n]) { exist=TRUE; } } if (!exist) { neightpts.push_back(tmpneightpts[m]); } */ } } } //继续 j++; } } } } }

　　计算邻域的代码如下（循环所有点进行计算）：

1 2 3 4 5 6 7 8 9 10 11 12 13 vector<AbstractDataPT*> DataPT2D::regionQuery(vector<AbstractDataPT*> pts,AbstractDataPT* pt,double eps) { vector<AbstractDataPT*> neighbor; int i,length=pts.size(); for (i=0;i<length;i++) { if (pt->calculateD(pts[i])<=eps) { neighbor.push_back(pts[i]); } } return neighbor; }

时间表现非常慢，仍需要改进；

数据类型如下：

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 #pragma once #include <vector> using namespace std; class AbstractDataPT { public: AbstractDataPT(void); virtual ~AbstractDataPT(void); virtual double calculateD(AbstractDataPT* otherPT)=0; virtual void drawSelf(CDC* pDc)=0; virtual void drawNormalSelf(CDC* pDc)=0; virtual vector<AbstractDataPT*> calculateMeans(vector<AbstractDataPT*> pts,int k)=0; virtual double calculateE(vector<AbstractDataPT*> pts,vector<AbstractDataPT*> kMeans)=0; virtual vector<AbstractDataPT*> regionQuery(vector<AbstractDataPT*> pts,AbstractDataPT* pt,double eps)=0; public: //属于哪一个类别 int categroy; COLORREF m_colorPT; BOOL m_bVisited; BOOL m_isInclude; BOOL m_isNoise; }; #pragma once #include "abstractdatapt.h" class DataPT2D : public AbstractDataPT { public: DataPT2D(void); DataPT2D(double cx,double cy); DataPT2D(double cx,double cy,COLORREF color); ~DataPT2D(void); double calculateD(AbstractDataPT* otherPT); void drawSelf(CDC* pDc); void drawNormalSelf(CDC* pDc); vector<AbstractDataPT*> calculateMeans(vector<AbstractDataPT*> pts,int k); double calculateE(vector<AbstractDataPT*> pts,vector<AbstractDataPT*> kMeans); vector<AbstractDataPT*> regionQuery(vector<AbstractDataPT*> pts,AbstractDataPT* pt,double eps); double m_fCX; double m_fCY; };

　　release下经过N久之后终于有结果了，截图如下：

技术分享

结果有点诡异，因为颜色只定义了18种，类别循环使用的颜色

修正：第二题目预处理的时候这里忽略了一些数据的问题，所以第二个数据集聚类出效果不是很好，这里修正一下，不只是取出白色的点，这里过滤条件增强，去除稍微浅色的点，这里暂时设置如下：

1 2 3 4 if (nCR>235&&nCG>235&&nCB>235) { continue; }

这样处理还有一个好处就是过滤了一部分点，点的数量降低了；

这样就可以得到一个比较清晰的图像了，截图如下，这里图形进行了放大，所以显示出来有部分没有显示：

技术分享

这里找到了一种提高时间效率的方法，书中给出的DBSCAN提高时间效率的方法是利用空间索引，这里查到一种RTree的方法，但是实现代价比较昂贵，而且不知道如何实现，下载了RTree实现的源代码有时间学习学习这一个数据结构~

这里存在另外一种方法同样提高效率，针对这个数据同样能够降低很多时间复杂度，针对此数据实现的较好应该能够达到线性时间，这里能够在5分钟内跑出结果，其实这也是很慢的了，因为我的程序实现很多地方比较冗余，比较杂乱，程序写的修改质量不高，也不好修改，5分钟已经达到我的目标，所以我就不进行修改了，如果你看到这个利用这个原理，也许能够达到一分钟以内的哟，亲；

原理如下，针对每个点，计算其邻域的时候首先按照半径将图划分成矩形的格子；然后每个点只需要计算其周围的9个格子的内容，针对此数据可以看错做常数时间~预处理为线性；

技术分享

主要代码如下：

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 vector<AbstractDataPT*> DataPT2D::regionQuery(vector<AbstractDataPT*> pts,AbstractDataPT* pt,double eps) { int index = regionNum(pt,eps); //index-1-x_2d_max/eps,index-x_2d_max/eps;index+1-x_2d_max/eps //index-1,index,index+1 //index-1+x_2d_max/eps,index+x_2d_max/eps;index+1+x_2d_max/eps int num = x_2d_max/eps; vector<AbstractDataPT*> tmp; int i,j,k,length=pts.size(); for (i=-1;i<2;i++) { for (j=-1;j<2;j++) { int tmpindex = index+i+num*j; if (tmpindex<0) { tmpindex=0; } int size = Region[tmpindex]->regionBy.size(); for (k=0;k<size;k++) { tmp.push_back(Region[index+i+num*j]->regionBy[k]); } } } vector<AbstractDataPT*> neighbor; length=tmp.size(); for (i=0;i<length;i++) { if (pt->calculateD(tmp[i])<=eps) { neighbor.push_back(tmp[i]); } } return neighbor; } int DataPT2D::regionNum(AbstractDataPT* pt,double eps) { //转换 DataPT2D* tmpDatapt=(DataPT2D*)pt; int x_index=tmpDatapt->m_fCX/eps; int y_index=tmpDatapt->m_fCY/eps; int num = x_2d_max/eps; int result=x_index+y_index*num; return result; }

void Cluster::dbscan(vector<AbstractDataPT*> pts,double eps,int MinPts) { //清空region数据结构 Region.clear(); int categroy=0; int i,length=pts.size(); //初始化region数据结构 //这里直接将region初始化足够长，偷懒 for (i=0;i<BIGMAX;i++) { Region.push_back(new regionvector()); } //根据半径初始化Region数据 for(i=0;i<length;i++) { Region[pts[i]->regionNum(pts[i],eps)]->regionBy.push_back(pts[i]); //? } vector<AbstractDataPT*> neightpts,tmpneightpts; //遍历所有的点 for (i=0;i<length;i++) { //Sleep(1000); //如果没有访问，标记为访问 if (!pts[i]->m_bVisited) { pts[i]->m_bVisited=TRUE; //计算邻域 neightpts=pts[i]->regionQuery(pts,pts[i],eps); //include //噪音标记 if (neightpts.size()<MinPts) { pts[i]->categroy=0;//NOISE pts[i]->m_isNoise=TRUE; pts[i]->m_colorPT=colorfultmp[pts[i]->categroy%18]; } else { //核心点，聚类 categroy++; pts[i]->categroy=categroy; pts[i]->m_colorPT=colorfultmp[pts[i]->categroy%18]; //邻域内的点标记已经被添加 for (int k=0;k<neightpts.size();k++) { neightpts[k]->m_isInclude=TRUE; } //继续扩大此聚集类 int j=1; //因为邻域的规模组建增大，所以利用while while (j<neightpts.size()) { //没有类别则加入该类别，扩大的类别 if (neightpts[j-1]->categroy==0) { neightpts[j-1]->categroy=categroy; neightpts[j-1]->m_colorPT=colorfultmp[neightpts[j-1]->categroy%18]; } //标记访问 if (!neightpts[j-1]->m_bVisited) { neightpts[j-1]->m_bVisited=TRUE; //计算邻域 tmpneightpts=neightpts[j-1]->regionQuery(pts,neightpts[j-1],eps); //合并核心点的邻域 if (tmpneightpts.size()>=MinPts) { //合并 int m,n; //已经添加则不进行添加 for (m=0;m<tmpneightpts.size();m++) { /* if (tmpneightpts[m]->categroy==0) { neightpts.push_back(tmpneightpts[m]); } */ //不重复添加 //感觉这里限制程序时间的瓶颈 if (!tmpneightpts[m]->m_isInclude) { neightpts.push_back(tmpneightpts[m]); tmpneightpts[m]->m_isInclude=TRUE; } /* BOOL exist=FALSE; for (n=0;n<neightpts.size();n++) { if (tmpneightpts[m]==neightpts[n]) { exist=TRUE; } } if (!exist) { neightpts.push_back(tmpneightpts[m]);

                                          

                  

	 	
                    
                        
                            
                                < 上一篇
DB2Error
                            
                            
                                 下一篇 >
Mongodb java api操作
                            
                        
                    
                    
                    
                    
                    
                
              
                    
                
                  

                    
                        人气教程排行
                    
                    
			
                        
						
 
                                
                                    229次
                                    1
                                   oracle 用户解锁和修改用户密码                                
                            
							  								  														  
                                
                                    229次
                                    2
                                   解决：安装SQL Server 2008 Native Client遇到错误（在Navicat premium新建sqlserver连接时 需要）：An error occurred during ...HRESULT: 0x80070422（注意尾部的错误号）                                
                            
								  														  								  
                                
                                    229次
                                    3
                                   rocksdb 编译步骤                                
                            
														  								  							
                                
                                    229次
                                    4
                                   adb命令开关蓝牙及NFC                                
                            
							  								  							
                                
                                    229次
                                    5
                                   解决数据库连接错误 您在wp-config.php文件中提供的数据库用户名和密码可能不正确，或者无法连接到localhost上的数据库服务器，这意味着您的主机数据库服务器已停止工作。                                
                            
							  								  							
                                
                                    229次
                                    6
                                   使用java自动填充，实现mysql的创建修改时间的自动填充                                
                            
							  								  							
                                
                                    229次
                                    7
                                   Linux mount挂载磁盘报错  mount: wrong fs type, bad option, bad superblock on /dev/vdb                                
                            
							  								  							
                                
                                    228次
                                    8
                                   关于mysql创建数据库中字符集和排序规则的选择                                
                            
							  								  							
                                
                                    228次
                                    9
                                   Linux设置Mysql开机自启动服务                                
                            
							  								  							
                                
                                    227次
                                    10
                                   SQL2008：WITH MOVE 子句可用于重新定位一个或多个文件                                
                            
							  								  							
                                
                                    227次
                                    11
                                   图数据库Neo4j在GIS系统的应用                                
                            
							  								  							
                                
                                    227次
                                    12
                                   SpringBoot配置  druid 数据源配置   慢SQL记录                                
                            
							  								  							
                                
                                    227次
                                    13
                                   SQL state [72000]; error code [1013]; ORA-03111: 通信通道收到中断; java.sql.SQLException: ORA-01745: 无效的主机/绑定变量名;java.sql.SQLException: ORA-01013: 用户请求取消当前的操作                                
                            
							  								  							
                                
                                    227次
                                    14
                                   docker-compose启动MySQL并配置远程登录                                
                            
							  								  							
                                
                                    225次
                                    15
                                   Mysql安装（for mac）                                
                            
							  								  							
                                
                                    225次
                                    16
                                   kettle将图片转换至二进制存储至数据库                                
                            
							  								  							
                                
                                    225次
                                    17
                                   adb的安装及配置                                
                            
							  								  							
                                
                                    224次
                                    18
                                   使用PHP控制MODBUS-RTU设备                                
                            
							  								  							
                                
                                    224次
                                    19
                                   MySQL 查询时间差值大于某一个值的 记录                                
                            
							  								  							
                                
                                    224次
                                    20
                                   MongoDB内存配置 --wiredTigerCacheSizeGB                                
                            

                        
                    
                
            
            
        
    
  


   本站所有资源全部来源于网络，若本站发布的内容侵害到您的隐私或者利益，请联系我们删除！


 



登录

 










                 
                
                
                
                记住我的登录                            
        忘记密码？ 
  登录 
  




  
    
    
    
  
   
  
    
    
  
    
      
      我已阅读用户协议及版权声明
  
  
注册