当前位置:Gxlcms > PHP教程 > 权重计算,稍加修改亦可用于分词,词频统计,全文和spam检测等

权重计算,稍加修改亦可用于分词,词频统计,全文检索和垃圾信息(spam)检测等

时间:2021-07-01 10:21:17 帮助过:10人阅读

效率非常可观,你要是改成其他用处那效率我就不保证了
  1. /* vim: set expandtab tabstop=4 shiftwidth=4: */
  2. // +------------------------------------------------------------------------
  3. // Name : 权重计算
  4. // Description: 稍加修改,亦可用于分词,词频统计,全文检索和垃圾检测
  5. // Date : 2013/12/16 08:51
  6. // Authors : latel
  7. // +------------------------------------------------------------------------
  8. //
  9. /*外部调用示例*/
  10. /*
  11. $aItems = array(
  12. 'chinaisbig',
  13. 'whichisnot',
  14. 'totalyrightforme',
  15. );
  16. $aTable = array(
  17. 'china,is|small',
  18. 'china,big|me',
  19. 'china,is|big,which|not,me',
  20. 'totaly|right,for,me',
  21. );
  22. $oWeight = new weight;
  23. $oWeight->newItems($aItems);
  24. $oWeight->newTable($aTable); $aResult = $oWeight->getShow();
  25. */
  /**
   * Keyword weight calculator backed by a byte-wise trie.
   *
   * A "table" is a list of rule lines; each line holds words separated by
   * ',' or '|'. Every word is inserted into the trie and tagged with the key
   * of the line it came from. An "item" is a string that is scanned for those
   * words; for each item the class reports which table line(s) were hit most
   * often.
   *
   * Matching is done on bytes (str_split / strlen), so multi-byte text is
   * matched byte sequence by byte sequence.
   */
  class weight {
      /** @var array Trie nodes. Node 0 is the root; each node maps a byte to a child node index, and the 'acc' entry lists the table keys whose word ends at that node. */
      protected $aDict = array(array());

      /** @var array Items (strings) to be scored. */
      protected $aItems = array();

      /** @var string|null Rule used to build the cached result in $aShow. */
      protected $sLastRule;

      /** @var array Per-item hit counts (table key => number of hits) from the last genShow() run. */
      protected $aMatchs = array();

      /** @var array Cached result of the last genShow() run. */
      protected $aShow = array();

      /**
       * Drops the cached match table and the cached result.
       *
       * The properties are reset to empty arrays rather than unset so that
       * they stay declared on the object.
       */
      private function init() {
          $this->aShow = array();
          $this->aMatchs = array();
          $this->sLastRule = null;
      }

      /**
       * Replaces the items to be scored.
       *
       * @param mixed $mItems A list of strings, or a single value that is wrapped into a one-element list.
       */
      public function newItems($mItems) {
          $this->aItems = is_array($mItems) ? $mItems : array($mItems);
          $this->init();
      }

      /**
       * Replaces the lookup table and rebuilds the trie from it.
       *
       * The trie is rebuilt from scratch on every call: words are tagged with
       * the key of their table line, so keeping nodes from a previous table
       * would mix up lines that happen to share a key.
       *
       * @param array $aTable Rule lines; words within a line are separated by ',' or '|'.
       */
      public function newTable(array $aTable) {
          $this->aDict = array(array());
          foreach ($aTable as $mLineKey => $sTableLine) {
              // '|' and ',' are equivalent separators.
              $aWords = explode(',', str_replace('|', ',', $sTableLine));
              foreach ($aWords as $sWord) {
                  $this->genDict($sWord, $mLineKey);
              }
          }
          $this->init();
      }

      /**
       * Returns the result for the given rule, using the cached result when
       * it was built with the same rule.
       *
       * @param string $sRule Scoring rule; only 'max' is implemented.
       * @return array Empty when there are no items or the trie holds no words; otherwise see genShow().
       */
      public function getShow($sRule = 'max') {
          // The root node always exists, so an "empty dictionary" is a root
          // without any outgoing edge.
          if (empty($this->aItems) || empty($this->aDict[0])) {
              return array();
          }
          if (empty($this->aShow) || $sRule !== $this->sLastRule) {
              return $this->genShow($sRule);
          }
          return $this->aShow;
      }

      /**
       * Scans every item, stores the per-item hit counts in $aMatchs and
       * builds the result for the given rule.
       *
       * @param string $sRule Scoring rule. With 'max', each item key maps to the list of table keys
       *                      with the highest hit count (an empty list when nothing matched).
       *                      Any other rule produces an empty result.
       * @return array Item key => list of table keys.
       */
      public function genShow($sRule) {
          $aShow = array();
          $aMatchs = array();
          foreach ($this->aItems as $mItemKey => $sItem) {
              $aCounts = array_count_values($this->matchWord($sItem));
              $aMatchs[] = $aCounts;
              switch ($sRule) {
                  case 'max':
                      // max() must not be called on an empty array.
                      $aShow[$mItemKey] = empty($aCounts)
                          ? array()
                          : array_keys($aCounts, max($aCounts), true);
                      break;
              }
          }
          $this->aShow = $aShow;
          $this->aMatchs = $aMatchs;
          // Remember the rule so getShow() can reuse the cached result.
          $this->sLastRule = $sRule;
          return $aShow;
      }

      /**
       * Inserts one word into the trie and tags its final node with $iKey.
       *
       * @param mixed $mWord Word to insert; empty words are ignored.
       * @param mixed $iKey  Table key recorded on the accepting node.
       */
      private function genDict($mWord, $iKey = '') {
          $mWord = (string) $mWord;
          if ($mWord === '') {
              // Happens for lines such as "a,,b"; there is nothing to index.
              return;
          }
          $iInsertPoint = count($this->aDict); // index of the next free node
          $iCur = 0;                           // current node, starting at the root
          foreach (str_split($mWord) as $sChar) {
              if (isset($this->aDict[$iCur][$sChar])) {
                  $iCur = $this->aDict[$iCur][$sChar];
                  continue;
              }
              $this->aDict[$iInsertPoint] = array();
              $this->aDict[$iCur][$sChar] = $iInsertPoint;
              $iCur = $iInsertPoint;
              $iInsertPoint++;
          }
          $this->aDict[$iCur]['acc'][] = $iKey;
      }

      /**
       * Scans a string for dictionary words.
       *
       * The scan accepts a word as soon as an accepting node is reached and
       * then restarts from the root, so a word that has another dictionary
       * word as a prefix is never reported. When a partial match fails, the
       * scan resumes one byte after the position where that attempt started.
       *
       * @param string $sLine Text to scan.
       * @return array Table keys of all matched words, in match order (with repeats).
       */
      public function matchWord($sLine) {
          $iCur = $iOffset = $iPosition = 0;
          // Sentinel byte: forces a final mismatch so that a partial match
          // running into the end of the string is rewound and retried.
          $sLine = (string) $sLine . "\0";
          $iLen = strlen($sLine);
          $aReturn = array();
          while ($iOffset < $iLen) {
              $sChar = $sLine[$iOffset];
              if (isset($this->aDict[$iCur][$sChar])) {
                  $iCur = $this->aDict[$iCur][$sChar];
                  if (isset($this->aDict[$iCur]['acc'])) {
                      foreach ($this->aDict[$iCur]['acc'] as $mKey) {
                          $aReturn[] = $mKey;
                      }
                      $iPosition = $iOffset + 1;
                      $iCur = 0;
                  }
              } else {
                  // Mismatch: go back to where this attempt started; the
                  // increment below then moves on to the following byte.
                  $iCur = 0;
                  $iOffset = $iPosition;
                  $iPosition = $iOffset + 1;
              }
              ++$iOffset;
          }
          return $aReturn;
      }
  }
  117. ?>

人气教程排行