当前位置:Gxlcms > PHP教程 > PHP敏感词过滤

PHP敏感词过滤

时间:2021-07-01 10:21:17 帮助过:20人阅读

  /**
   * Banned-word (sensitive-word) filter.
   *
   * Banned words are loaded from storage in id-range pages, each page is turned
   * into a character trie, and the input text is scanned against every page.
   *
   * The original author's note claims about 0.05 seconds per article.
   *
   * @author liuxu
   */
  class Logic_BlackWord
  {
      const APP_FORUM = 1;
      const APP_BLOG = 2;
      const APP_VOTE = 3;

      /** Longest stretch of text (in characters) examined for a single match. */
      const MAX_WORD_LENGTH = 50;

      /**
       * Find every banned word contained in a text, grouped by word type.
       *
       * @param string $txt Raw text; HTML tags and punctuation are ignored.
       * @return array Map of type => list of matched words (strings).
       */
      public function getHitList($txt)
      {
          $hitList = array();
          // Filter against the banned words page by page.
          $max = $this->getMax();
          if ($max) {
              $size = 1000;
              $last = ceil($max / $size);
              for ($page = 1; $page <= $last; $page++) {
                  $result = $this->getHitListByPage($txt, $page, $size);
                  if ($result) {
                      // array_replace keeps keys as they are. array_merge would
                      // renumber purely numeric words, because PHP stores a key
                      // such as "110" as the integer 110.
                      $hitList = array_replace($hitList, $result);
                  }
              }
          }

          $hitList2 = array();
          foreach ($hitList as $hit => $type) {
              // Cast back to string: numeric words come out of the array as ints.
              $hitList2[$type][] = (string)$hit;
          }
          return $hitList2;
      }

      /**
       * Highest banned-word id, used to work out how many pages to scan.
       *
       * The value is cached under 'blackWord_max' for 300 seconds.
       *
       * @return int 0 when no id could be determined.
       */
      private function getMax()
      {
          $redis = Rds::factory();
          $memKey = 'blackWord_max';
          $max = $redis->get($memKey);
          if ($max === false) { // cache miss
              $max = 0;
              $blackWord = new Model_BlackWord_BlackWord();
              $para = array('field' => "MAX(id) AS max");
              $result = $blackWord->search($para);
              if (isset($result[0]['max'])) {
                  $max = $result[0]['max'];
              }
              $redis->setex($memKey, 300, $max);
          }
          return (int)$max;
      }

      /**
       * Scan the text against one page of banned words.
       *
       * @param string $txt
       * @param int $page 1-based page number.
       * @param int $size Width of the id range covered by one page.
       * @return array Map of matched word => type.
       */
      private function getHitListByPage($txt, $page = 1, $size = 1000)
      {
          $hitList = array();
          // Load the trie for this page of banned words.
          $wordTree = $this->getWordTreeByPage($page, $size);
          if (!is_array($wordTree) || !$wordTree) {
              return $hitList;
          }

          // Keep only ASCII letters, digits and CJK ideographs, so markup or
          // punctuation inserted between characters cannot hide a word.
          $txt = strip_tags((string)$txt);
          $txt = preg_replace('/[^a-zA-Z0-9\\x{4e00}-\\x{9fa5}]/iu', '', $txt);
          if ($txt === null || $txt === '') {
              // null means PCRE rejected the input (e.g. invalid UTF-8).
              return $hitList;
          }

          // Split once up front; calling mb_substr($txt, $i, ...) inside the
          // loop would re-walk the string from the start on every iteration.
          $chars = $this->splitChars($txt);
          $len = count($chars);
          for ($i = 0; $i < $len; $i++) {
              if (!isset($wordTree[$chars[$i]])) {
                  continue;
              }
              $window = implode('', array_slice($chars, $i, self::MAX_WORD_LENGTH));
              foreach ($this->getHitListByTree($window, $wordTree) as $hit => $type) {
                  $hitList[$hit] = $type;
              }
          }
          return $hitList;
      }

      /**
       * Collect every banned word that starts at the first character of $txt.
       *
       * Walks the trie one character at a time and stops at the first character
       * with no matching child; each node carrying a 'type' entry is a match.
       *
       * @param string $txt
       * @param array $wordTree Trie built by getWordTreeByPage(); only read here.
       * @return array Map of matched word => type.
       */
      private function getHitListByTree($txt, &$wordTree)
      {
          $hitList = array();
          $hit = '';
          $node = $wordTree;
          foreach ($this->splitChars($txt) as $char) {
              if (!isset($node[$char])) {
                  break;
              }
              $hit .= $char;
              $node = $node[$char];
              if (isset($node['type'])) { // a complete banned word ends here
                  $hitList[$hit] = $node['type'];
              }
          }
          return $hitList;
      }

      /**
       * Build (or fetch from cache) the trie for one page of banned words.
       *
       * A page covers the ids in ($page-1)*$size < id <= $page*$size with
       * status=1. Each trie level is keyed by one character; the node at the
       * end of a word holds that word's type under the 'type' key.
       * The trie is cached for 300 seconds.
       *
       * NOTE(review): this assumes the Rds wrapper serializes arrays on setex()
       * and returns them as arrays from get() - confirm against the wrapper.
       *
       * @param int $page
       * @param int $size
       * @return array
       */
      private function getWordTreeByPage($page = 1, $size = 1000)
      {
          $redis = Rds::factory();
          $memKey = 'blackWord_tree_' . $page . '_' . $size;
          $wordTree = $redis->get($memKey);
          if ($wordTree === false) { // cache miss
              $wordTree = array();
              $blackWord = new Model_BlackWord_BlackWord();
              // Cast so only integers are ever concatenated into the condition.
              $start = ((int)$page - 1) * (int)$size;
              $end = $start + (int)$size;
              $para = array('where' => "status=1 AND id>" . $start . " AND id<=" . $end);
              $result = $blackWord->search($para);
              if ($result) {
                  foreach ($result as $value) {
                      if (!isset($value['word']) || (string)$value['word'] === '') {
                          continue;
                      }
                      $point = & $wordTree;
                      foreach ($this->splitChars($value['word']) as $char) {
                          // Referencing a missing key creates the child node.
                          $point = & $point[$char];
                      }
                      $point['type'] = $value['type'];
                      unset($point); // drop the reference before the next word
                  }
              }
              $redis->setex($memKey, 300, $wordTree);
          }
          return $wordTree;
      }

      /**
       * Split a UTF-8 string into a list of single characters.
       *
       * @param string $str
       * @return array Empty when the string is empty or is not valid UTF-8.
       */
      private function splitChars($str)
      {
          $chars = preg_split('//u', (string)$str, -1, PREG_SPLIT_NO_EMPTY);
          return $chars === false ? array() : $chars;
      }
  }

PHP

人气教程排行