当前位置:Gxlcms > asp.net > 使用HtmlAgilityPack XPath 表达式抓取博客园数据的实现代码

使用HtmlAgilityPack XPath 表达式抓取博客园数据的实现代码

时间:2021-07-01 10:21:17 帮助过:87人阅读


Web 前端代码
代码如下:

  <%@ Page Language="C#" AutoEventWireup="true" CodeFile="Default.aspx.cs" Inherits="_Default" %>
  <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
  <html xmlns="http://www.w3.org/1999/xhtml">
  <head runat="server">
      <title></title>
  </head>
  <body>
      <form id="form1" runat="server">
      <div>
          <table cellpadding="1" cellspacing="1" bgcolor="#f1f1f1" style="text-align: center">
              <asp:Repeater ID="Repeater1" runat="server">
                  <HeaderTemplate>
                      <tr>
                          <td>
                              标题
                          </td>
                          <td>
                              发布作者
                          </td>
                          <td>
                              发布时间
                          </td>
                      </tr>
                  </HeaderTemplate>
                  <ItemTemplate>
                      <tr bgcolor="#ffffff">
                          <td align="left">
                              <a href='<%#Eval("url") %>' target="_blank">
                                  <%#Eval("title") %>
                              </a>
                          </td>
                          <td>
                              <a href='<%#Eval("authorUrl") %>' target="_blank">
                                  <%#Eval("author") %>
                              </a>
                          </td>
                          <td>
                              <%#Eval("updatetime") %>
                          </td>
                      </tr>
                  </ItemTemplate>
              </asp:Repeater>
          </table>
      </div>
      </form>
  </body>
  </html>

  cs 后台代码:
  代码如下:

  using System;
  using System.Collections.Generic;
  using System.Linq;
  using System.Web;
  using System.Web.UI;
  using System.Web.UI.WebControls;
  using S1;
  using System.Net;
  using System.IO;
  using System.Text;
  using HtmlAgilityPack;

  /// <summary>
  /// Code-behind for Default.aspx. On the first (non-postback) request it downloads
  /// a cnblogs.com listing page, extracts the post links and author links with
  /// HtmlAgilityPack XPath queries, and binds the result to <c>Repeater1</c>.
  /// </summary>
  public partial class _Default : System.Web.UI.Page
  {
      /// <summary>
      /// Fetches the listing page, scrapes it and binds the scraped rows to the repeater.
      /// Does nothing on postbacks.
      /// </summary>
      /// <param name="sender">Event source (unused).</param>
      /// <param name="e">Event data (unused).</param>
      protected void Page_Load(object sender, EventArgs e)
      {
          if (IsPostBack)
          {
              return;
          }

          string address = "http://www.cnblogs.com";
          string pageSegment = Request.QueryString["p"];
          if (!string.IsNullOrEmpty(pageSegment))
          {
              // Paging: p=p2, p=p3, ...
              // The value comes from the query string, so escape it before it
              // becomes part of the URL that this server requests.
              address += "/" + Uri.EscapeDataString(pageSegment);
          }

          // Dispose the client, the response stream and the reader deterministically.
          string html;
          using (WebClient wc = new WebClient())
          using (Stream stream = wc.OpenRead(address))
          using (StreamReader sr = new StreamReader(stream, Encoding.UTF8))
          {
              html = sr.ReadToEnd();
          }

          // Parse the downloaded markup.
          HtmlDocument doc = new HtmlDocument();
          doc.LoadHtml(html);

          IList<Cnblogs> cnlist = new List<Cnblogs>();

          // Look up the container element by its id. GetElementbyId returns null
          // when the id is absent, in which case an empty list is bound.
          HtmlNode navNode = doc.GetElementbyId("post_list");
          if (navNode != null)
          {
              // XPath positions are 1-based: div[2] selects the second div child of
              // its parent. The leading "." keeps the query relative to navNode;
              // a bare "//" would search the whole document instead.
              HtmlNodeCollection titleLinks = navNode.SelectNodes(".//div[2]/h3/a");
              if (titleLinks != null) // SelectNodes returns null when nothing matches
              {
                  foreach (HtmlNode node in titleLinks)
                  {
                      Cnblogs cnblogs = new Cnblogs();
                      // Post URL (empty string when the anchor has no href).
                      cnblogs.url = node.GetAttributeValue("href", string.Empty);
                      // Post title.
                      cnblogs.title = node.InnerText;
                      cnlist.Add(cnblogs);
                  }
              }

              HtmlNodeCollection authorLinks = navNode.SelectNodes(".//div[2]/div/a");
              if (authorLinks != null)
              {
                  // The two node lists are paired by position; never index past the shorter one.
                  int pairCount = Math.Min(cnlist.Count, authorLinks.Count);
                  for (int i = 0; i < pairCount; i++)
                  {
                      HtmlNode authorLink = authorLinks[i];
                      cnlist[i].author = authorLink.InnerText;
                      cnlist[i].authorUrl = authorLink.GetAttributeValue("href", string.Empty);

                      // The node right after the author link is read as the publish time,
                      // with the "发布于" prefix stripped.
                      HtmlNode timeNode = authorLink.NextSibling;
                      cnlist[i].updatetime = timeNode == null
                          ? string.Empty
                          : timeNode.InnerText.Replace("发布于", "").Trim();
                  }
              }
          }

          this.Repeater1.DataSource = cnlist;
          this.Repeater1.DataBind();
      }

      /// <summary>
      /// One scraped row. The property names are referenced by the
      /// <c>Eval("...")</c> expressions in Default.aspx, so they must stay as they are.
      /// </summary>
      public class Cnblogs
      {
          /// <summary>Post title (inner text of the post link).</summary>
          public string title { get; set; }

          /// <summary>Post URL (href of the post link).</summary>
          public string url { get; set; }

          /// <summary>Author name (inner text of the author link).</summary>
          public string author { get; set; }

          /// <summary>Author URL (href of the author link).</summary>
          public string authorUrl { get; set; }

          /// <summary>Text following the author link with the "发布于" prefix removed.</summary>
          public string updatetime { get; set; }
      }
  }

人气教程排行