当前位置:Gxlcms > PHP教程 > 解析HTML标签,并实现快速查找节点,获取节点信息

解析HTML标签,并实现快速查找节点,获取节点信息

时间:2021-07-01 10:21:17 帮助过:25人阅读

详细介绍和使用请点击源码出处。
  1. /**
  2. * html标签解析包
  3. *
  4. * @category TagParse
  5. * @package TagParse
  6. * @author kun
  7. * @copyright 2014 kun
  8. * @license http://www.php.net/license/3_01.txt PHP License 3.01
  9. * @version 1.0
  10. * @link http://www.blogkun.com
  11. * @since 1.0
  12. */
  13. namespace TagParse;
  /**
   * TagDomRoot
   *
   * Root of a parsed tag tree. The constructor splits the markup into its
   * top-level tags and creates one TagDomNode per tag; every node repeats
   * the same step on its own inner content. Nodes can then be looked up
   * either by walking the tree (find) or through the static tag index that
   * the nodes fill in while they are constructed (findGlobal).
   *
   * The tag index, the result buffer and the error flags are static, i.e.
   * shared by every document parsed in the same process. Call
   * initProperty() before parsing an unrelated document.
   *
   * @category  TagParse
   * @package   TagParse
   * @author    kun
   * @copyright 2014 kun
   * @license   http://www.php.net/license/3_01.txt PHP License 3.01
   * @version   1.0
   * @link      http://www.blogkun.com
   * @since     1.0
   */
  class TagDomRoot
  {
      /** @var string Tag name; the root uses the pseudo name 'root'. */
      public $tag = 'root';
      /** @var string|null Markup of this element with every tag removed. */
      public $plaintext;
      /** @var array Direct child nodes in document order. */
      public $child = array();
      /** @var int Depth in the tree; the root is level 0. */
      public $level = 0;
      /** @var bool Set to true as soon as any parse step reports a problem. */
      public static $TagParseError = false;
      /** @var array Index of nodes: tag name => raw attribute string => nodes. */
      protected static $TagSet = array();
      /** @var array Result buffer shared by the search methods. */
      protected static $FoundNode = array();
      /** @var array Markup of the tags that could not be paired up. */
      public static $ErrorTag = array();

      /**
       * initProperty
       *
       * Resets the shared static state (error flag, tag index, result buffer
       * and error list). The previous version assigned to local variables and
       * therefore reset nothing.
       *
       * @access public
       *
       * @return null
       */
      public function initProperty()
      {
          self::$TagParseError = false;
          self::$TagSet = array();
          self::$FoundNode = array();
          self::$ErrorTag = array();
      }

      /**
       * __construct
       *
       * @param string $str The tag string to be parse.
       *
       * @access public
       *
       * @return TagDomRoot
       */
      public function __construct($str)
      {
          $this->_removeNoise($str);
          if ($str === null) {
              // null input, or preg_replace() failed while removing noise
              self::$TagParseError = true;
              return;
          }
          // Text content = markup with every tag removed. This also yields the
          // text itself (instead of null) when the input contains no tag.
          $this->plaintext = (string) preg_replace('~<[^>]*>~', '', $str);

          $tagCollect = array();
          $attrCollect = array();
          $innerContentCollect = array();
          if ($this->parseTag($str, $tagCollect, $attrCollect, $innerContentCollect) === false) {
              self::$TagParseError = true;
          }
          foreach ($tagCollect as $index => $tag) {
              $this->child[] = new TagDomNode(
                  $tag,
                  $this,
                  $attrCollect[$index],
                  $innerContentCollect[$index],
                  $this->level + 1
              );
          }
      }

      /**
       * parseTag
       *
       * Splits $str into its top-level elements. For every element found, the
       * tag name, the raw attribute string and the inner markup are appended
       * to the three collectors at the same index.
       *
       * Stray closing tags and opening tags without a matching closing tag are
       * recorded in self::$ErrorTag and make the method return false.
       *
       * @param string $str                  Markup to split.
       * @param array  &$tagCollect          Receives the tag names.
       * @param array  &$attrCollect         Receives the raw attribute strings.
       * @param array  &$innerContentCollect Receives the inner markup.
       *
       * @access protected
       *
       * @return boolean True when every tag could be paired up.
       */
      protected function parseTag($str, array &$tagCollect, array &$attrCollect, array &$innerContentCollect)
      {
          $selfClosingTags = array('img' => 1, 'br' => 1, 'input' => 1, 'meta' => 1, 'link' => 1, 'hr' => 1, 'base' => 1, 'embed' => 1, 'spacer' => 1);
          $str = (string) $str;
          $length = strlen($str);
          $cursor = 0; // offset at which the search for the next tag resumes
          $error = false;

          while ($cursor < $length) {
              $l = strpos($str, '<', $cursor);
              if ($l === false) {//parse end
                  break;
              }
              $r = strpos($str, '>', $l);
              if ($r === false) {
                  // No '>' left. Only an error when the '<' really starts a tag;
                  // a lone '<' in text (e.g. "1 < 2") is just content.
                  if (ctype_alpha(substr($str, $l + 1, 1))) {
                      $error = true;
                      self::$ErrorTag[] = substr($str, $l);
                  }
                  break;
              }
              if (substr($str, $l + 1, 1) === '/') {//surplus closing tag,discard
                  $error = true;
                  self::$ErrorTag[] = substr($str, $l, $r - $l + 1);
                  $cursor = $r + 1;
                  continue;
              }
              $tag = substr($str, $l + 1, $r - $l - 1);
              if (strpos($tag, '<') !== false) {
                  // A second '<' before the '>': this '<' is not a tag start.
                  // Resume right behind it so that the inner tag is still seen.
                  $cursor = $l + 1;
                  continue;
              }
              if ($tag === '' || !ctype_alpha($tag[0])) {
                  // "<>", "<!...>", "<?...>" and the like are skipped as a whole
                  $cursor = $r + 1;
                  continue;
              }
              $tag = preg_replace("~\n+~", ' ', $tag);
              // Drop the slash of "<br/>" / "<img ... />" so that the void-element
              // lookup below sees the bare name. A slash ending an unquoted
              // attribute value (href=/a/) belongs to the value and is kept.
              if (preg_match('~(?:^[^\s=]+|[\s"\'])\s*/\z~', $tag) === 1) {
                  $tag = rtrim(substr($tag, 0, -1));
              }
              $space = strpos($tag, ' ');
              if ($space !== false) {
                  $attrCollect[] = substr($tag, $space + 1);
                  $tag = substr($tag, 0, $space);
              } else {
                  $attrCollect[] = '';
              }
              $tagCollect[] = $tag;
              if (isset($selfClosingTags[$tag])) {
                  $innerContentCollect[] = '';
                  $cursor = $r + 1;
                  continue;
              }

              // The closing-tag needle had been lost (empty string) in the
              // published listing; the "+3" offsets of the old code show that it
              // was '</' . $tag . '>'.
              $closeNeedle = '</' . $tag . '>';
              $closeLength = strlen($closeNeedle);
              $close = strpos($str, $closeNeedle, $r + 1);
              if ($close === false) {//surplus opening tag
                  $innerContentCollect[] = (string) substr($str, $r + 1);
                  $error = true;
                  self::$ErrorTag[] = '<' . $tag . '>';
                  break;
              }
              // Same-named elements opened before the first closing tag: each of
              // them needs one more closing tag before ours is reached.
              $pending = 0;
              $open = $this->_nextOpenTag($str, $tag, $r + 1);
              while ($open !== false && $open < $close) {
                  $pending++;
                  $open = $this->_nextOpenTag($str, $tag, $open + 1);
              }
              while ($pending > 0) {
                  $from = $close + $closeLength;
                  $close = strpos($str, $closeNeedle, $from);
                  if ($close === false) {
                      break;
                  }
                  $pending--;
                  $open = $this->_nextOpenTag($str, $tag, $from);
                  while ($open !== false && $open < $close) {
                      $pending++;
                      $open = $this->_nextOpenTag($str, $tag, $open + 1);
                  }
              }
              if ($close === false) {// closing tags do not pair up
                  $innerContentCollect[] = (string) substr($str, $r + 1);
                  $error = true;
                  self::$ErrorTag[] = '<' . $tag . '>';
                  break;
              }
              $innerContentCollect[] = (string) substr($str, $r + 1, $close - $r - 1);
              $cursor = $close + $closeLength;
          }
          return !$error;
      }

      /**
       * _nextOpenTag
       *
       * Finds the next opening tag whose name is exactly $tag. A plain
       * strpos() for '<b' would also hit '<br>' or '<body>'.
       *
       * @param string $str    Markup to search.
       * @param string $tag    Tag name.
       * @param int    $offset Offset to start searching at.
       *
       * @access private
       *
       * @return int|false Offset of the '<', or false when there is none.
       */
      private function _nextOpenTag($str, $tag, $offset)
      {
          $needle = '<' . $tag;
          $needleLength = strlen($needle);
          $length = strlen($str);
          while ($offset < $length) {
              $pos = strpos($str, $needle, $offset);
              if ($pos === false) {
                  return false;
              }
              $after = $pos + $needleLength;
              if ($after >= $length) {
                  return $pos;
              }
              $char = $str[$after];
              if (!ctype_alnum($char) && $char !== '-' && $char !== '_' && $char !== ':') {
                  return $pos;
              }
              // a longer tag name that merely starts with $tag; keep looking
              $offset = $after;
          }
          return false;
      }

      /**
       * _removeNoise
       *
       * Removes markup that the tag splitter cannot deal with. $str becomes
       * null when a replacement fails.
       *
       * NOTE(review): the three patterns were empty ('~~is', a no-op) in the
       * published listing because the page had stripped everything that looked
       * like a tag. Comments, script blocks and style blocks are a
       * reconstruction; confirm against the original source.
       *
       * @param string &$str The tag string to be parse.
       *
       * @access private
       *
       * @return null
       */
      private function _removeNoise(&$str)
      {
          if ($str === null) {
              return;
          }
          $noise = array(
              '~<!--.*?-->~is',
              '~<script\b[^>]*>.*?</script\s*>~is',
              '~<style\b[^>]*>.*?</style\s*>~is',
          );
          foreach ($noise as $pattern) {
              $str = preg_replace($pattern, '', $str);
              if ($str === null) {
                  return;
              }
          }
      }

      /**
       * parseSelectors
       *
       * Splits a selector such as 'div[class="a"] span' into the list of tag
       * names and, at the same index, the attribute conditions of each tag.
       *
       * @param string $selectors     user's select condition.
       * @param array  &$selectorsTag  tags
       * @param array  &$selectorsAttr attributes (name => value per tag)
       *
       * @access protected
       *
       * @return null
       */
      protected function parseSelectors($selectors, array &$selectorsTag, array &$selectorsAttr)
      {
          preg_match_all('~([\w\d]+)(\[[\w\d -="._/]+\])?~', $selectors, $matches);
          $selectorsTag = $matches[1];
          foreach ($matches[2] as $key => $value) {
              $selectorsAttr[$key] = array();
              if ($value !== '') {
                  // The hyphen is escaped: PCRE2 (PHP 7.3+) rejects "\d-." as an
                  // invalid range.
                  preg_match_all('~([\w\d-]+)="([\w\d\-. _/]+)"~', $value, $pairs);
                  foreach ($pairs[1] as $index => $attr) {
                      $selectorsAttr[$key][$attr] = $pairs[2][$index];
                  }
              }
          }
      }

      /**
       * find
       *
       * Searches the tree below this node. Called with a selector string it
       * returns the matching nodes; the recursive calls pass null together
       * with the already parsed selector and return nothing.
       *
       * @param mixed $selectors     user's select condition.
       * @param array $selectorsTag  tags.
       * @param array $selectorsAttr attributes.
       *
       * @access public
       *
       * @return array|null
       */
      public function find($selectors, $selectorsTag = array(), $selectorsAttr = array())
      {
          if ($selectors !== null) {
              $this->parseSelectors($selectors, $selectorsTag, $selectorsAttr);
          }
          if (!empty($selectorsTag)) {
              $this->seek($selectorsTag, $selectorsAttr);
              foreach ($this->child as $node) {
                  $node->find(null, $selectorsTag, $selectorsAttr);
              }
          }
          if ($selectors !== null) {
              $res = self::$FoundNode;
              self::$FoundNode = array();
              return $res;
          }
          return null;
      }

      /**
       * findGlobal
       *
       * Looks the first selector segment up in the static tag index and
       * matches the remaining segments against the children of the hits.
       *
       * @param string $selectors user's select condition.
       *
       * @access public
       *
       * @return array
       */
      public function findGlobal($selectors)
      {
          // First space outside of [...] separates the first segment from the
          // rest; spaces inside an attribute value must not split.
          $space = false;
          $inBracket = false;
          $length = strlen($selectors);
          for ($i = 0; $i < $length; $i++) {
              $char = $selectors[$i];
              if ($char === '[') {
                  $inBracket = true;
              } elseif ($char === ']') {
                  $inBracket = false;
              } elseif ($char === ' ' && !$inBracket) {
                  $space = $i;
                  break;
              }
          }
          if ($space === false) {
              return $this->findOneGlobal($selectors);
          }

          $selectorsAttr = array();
          $selectorsTag = array();
          $this->findOneGlobal(substr($selectors, 0, $space), false);
          $this->parseSelectors(substr($selectors, $space + 1), $selectorsTag, $selectorsAttr);
          if (!empty(self::$FoundNode) && !empty($selectorsTag)) {
              $nodes = self::$FoundNode;
              self::$FoundNode = array();
              foreach ($nodes as $node) {
                  $node->seek($selectorsTag, $selectorsAttr);
              }
          }
          $res = self::$FoundNode;
          self::$FoundNode = array();
          return $res;
      }

      /**
       * seek
       *
       * Matches the first selector against the direct children. A child that
       * matches is stored in the result buffer when it was the last selector;
       * otherwise the remaining selectors are matched against its children.
       *
       * An attribute matches when the wanted value equals the attribute value
       * or is one of its space separated parts (class="a b" matches "a").
       *
       * @param array $selectorsTag  tags.
       * @param array $selectorsAttr attributes.
       *
       * @access protected
       *
       * @return null
       */
      protected function seek($selectorsTag, $selectorsAttr)
      {
          if (empty($selectorsTag)) {
              return;
          }
          $wanted = isset($selectorsAttr[0]) ? $selectorsAttr[0] : array();
          $isLast = count($selectorsTag) === 1;
          foreach ($this->child as $node) {
              if ($node->tag !== $selectorsTag[0]) {
                  continue;
              }
              $isFind = true;
              foreach ($wanted as $attrName => $value) {
                  // preg_quote(): the value is data, not a pattern
                  $pattern = '~(?:^| )' . preg_quote((string) $value, '~') . '(?: |$)~';
                  if (!isset($node->attr[$attrName])
                      || preg_match($pattern, $node->attr[$attrName]) !== 1
                  ) {
                      $isFind = false;
                      break;
                  }
              }
              if (!$isFind) {
                  continue;
              }
              if ($isLast) {
                  self::$FoundNode[] = $node;
              } else {
                  $node->seek(
                      array_slice($selectorsTag, 1),
                      array_slice($selectorsAttr, 1)
                  );
              }
          }
      }

      /**
       * findOneGlobal
       *
       * Looks a single selector ('tag' or 'tag[name="value"]') up in the
       * static tag index.
       *
       * @param string $selector user's select condition.
       * @param bool   $isReturn whether to return (and clear) the result buffer.
       *
       * @access public
       *
       * @return array|null
       */
      public function findOneGlobal($selector, $isReturn = true)
      {
          $tag = null;
          $attr = array();
          if (preg_match('~([\w\d]+)(\[[\w\d -="._/]+\])?~', $selector, $matches) === 1) {
              $tag = $matches[1];
              if (isset($matches[2])) {
                  // escaped hyphen, see parseSelectors()
                  preg_match_all('~([\w\d-]+)="([\w\d\-. _/]+)"~', $matches[2], $pairs);
                  foreach ($pairs[1] as $key => $name) {
                      $attr[$name] = $pairs[2][$key];
                  }
              }
          }
          if ($tag !== null && isset(self::$TagSet[$tag])) {
              foreach (self::$TagSet[$tag] as $attrValue => $nodeArray) {
                  $isFind = true;
                  foreach ($attr as $attrName => $value) {
                      // [^"]* keeps the match inside one attribute value; the old
                      // ".*?" could run on into the value of a later attribute.
                      $pattern = '~(?:^|\s)' . preg_quote((string) $attrName, '~')
                          . '="(?:[^"]* )?' . preg_quote((string) $value, '~')
                          . '(?: [^"]*)?"~';
                      if (preg_match($pattern, (string) $attrValue) !== 1) {
                          $isFind = false;
                          break;
                      }
                  }
                  if ($isFind) {
                      foreach ($nodeArray as $node) {
                          self::$FoundNode[] = $node;
                      }
                  }
              }
          }
          if ($isReturn) {
              $res = self::$FoundNode;
              self::$FoundNode = array();
              return $res;
          }
          return null;
      }
  }
  /**
   * TagDomNode
   *
   * One element of the parsed tag tree. The constructor parses the
   * attribute string, derives the text content, creates the child nodes
   * from the inner markup and registers the node in the static tag index
   * inherited from TagDomRoot.
   *
   * @uses TagDomRoot
   *
   * @category  TagParse
   * @package   TagParse
   * @author    kun
   * @copyright 2014 kun
   * @license   http://www.php.net/license/3_01.txt PHP License 3.01
   * @version   1.0
   * @link      http://www.blogkun.com
   * @since     1.0
   */
  class TagDomNode extends TagDomRoot
  {
      /** @var array Parsed attributes, name => value. */
      public $attr = array();
      /** @var TagDomRoot|null Node this element is a child of. */
      public $parent = null;

      /**
       * __construct
       *
       * @param string     $tag          tag name.
       * @param TagDomRoot $parent       parent node.
       * @param string     $attr         raw attribute string of the tag.
       * @param string     $innerContent markup between opening and closing tag.
       * @param int        $level        node level.
       *
       * @access public
       *
       * @return TagDomNode
       */
      public function __construct($tag, $parent, $attr, $innerContent, $level)
      {
          // substr() may have produced false for an empty remainder (PHP < 8)
          $attr = (string) $attr;
          $innerContent = (string) $innerContent;

          $this->tag = $tag;
          $this->parent = $parent;
          $this->_parseAttr($attr);
          $this->level = $level;

          // Text content = inner markup with every tag removed. The previous
          // three-step concatenation appended the whole inner markup a second
          // time when it held a tag but no ">text<" pair (e.g. "a<br>"), and
          // duplicated the tail when the text contained a '>' but no tag.
          $this->plaintext = (string) preg_replace('~<[^>]*>~', '', $innerContent);

          $tagCollect = array();
          $attrCollect = array();
          $innerContentCollect = array();
          if ($this->parseTag($innerContent, $tagCollect, $attrCollect, $innerContentCollect) === false) {
              self::$TagParseError = true;
          }
          foreach ($tagCollect as $index => $childTag) {
              $this->child[] = new TagDomNode(
                  $childTag,
                  $this,
                  $attrCollect[$index],
                  $innerContentCollect[$index],
                  $this->level + 1
              );
          }

          // Register in the index: tag name => raw attribute string => nodes.
          // Objects are handles, so no reference to $this is needed.
          if (!isset(self::$TagSet[$this->tag])) {
              self::$TagSet[$this->tag] = array();
          }
          if (!isset(self::$TagSet[$this->tag][$attr])) {
              self::$TagSet[$this->tag][$attr] = array();
          }
          self::$TagSet[$this->tag][$attr][] = $this;
      }

      /**
       * _parseAttr
       *
       * Fills $this->attr from every name="value" pair of the raw attribute
       * string. Only double-quoted values are recognised.
       *
       * The named groups had been stripped from the published pattern, which
       * left an invalid regular expression; they are restored from the keys
       * read below.
       *
       * @param string $str attribute string.
       *
       * @access private
       *
       * @return null
       */
      private function _parseAttr($str)
      {
          $count = preg_match_all('~(?<attrName>[\w-]+)="(?<attrValue>.*?)"~s', (string) $str, $matches);
          if ($count === false || $count === 0) {
              return;
          }
          foreach ($matches['attrName'] as $key => $value) {
              $this->attr[$value] = $matches['attrValue'][$key];
          }
      }
  }

人气教程排行