当前位置:Gxlcms > asp.net > asp.net 抓取网页源码三种实现方法

asp.net 抓取网页源码三种实现方法

时间:2021-07-01 10:21:17 帮助过:8人阅读

方法1 比较推荐  

  /// <summary>
  /// Downloads the source of a web page using <see cref="HttpWebRequest"/>.
  /// </summary>
  /// <remarks>
  /// The body is decoded with <see cref="Encoding.Default"/>. The
  /// <see cref="StreamReader"/> constructor used here also inspects the start of the
  /// stream for a byte order mark, so a page that begins with a BOM is decoded with
  /// the encoding that BOM indicates instead.
  /// </remarks>
  /// <param name="url">Address of the page to download.</param>
  /// <returns>The page source as text.</returns>
  public static string GetHtmlSource2(string url)
  {
      HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
      request.Accept = "*/*"; // accept any content type
      request.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.1.4322)"; // identify as IE 6
      request.AllowAutoRedirect = true; // follow redirect responses
      // To send and receive cookies, assign request.CookieContainer = new CookieContainer();
      request.Referer = url; // report the page itself as the referrer

      // Dispose the response, its stream and the reader even when reading throws;
      // otherwise the underlying connection stays open.
      using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
      using (Stream stream = response.GetResponseStream())
      using (StreamReader reader = new StreamReader(stream, Encoding.Default))
      {
          return reader.ReadToEnd();
      }
  }

方法2 

  1. using System;
  2. using System.Collections.Generic;
  3. using System.Linq;
  4. using System.Web;
  5. using System.IO;
  6. using System.Text;
  7. using System.Net;
  namespace MySql
  {
      /// <summary>
      /// Helper for downloading the text behind a URL.
      /// </summary>
      public class GetHttpData
      {
          /// <summary>
          /// Requests <paramref name="Url"/> and returns the response body decoded as UTF-8.
          /// </summary>
          /// <param name="Url">Address to request.</param>
          /// <returns>
          /// The response body, or <c>null</c> when obtaining the response fails
          /// (the failure is written to the trace listeners).
          /// </returns>
          public static string GetHttpData2(string Url)
          {
              WebRequest request = WebRequest.Create(Url);
              request.Timeout = 50000; // milliseconds

              WebResponse response;
              try
              {
                  response = request.GetResponse();
              }
              catch (Exception e)
              {
                  // Callers rely on null meaning "could not fetch"; record why instead of
                  // discarding the exception silently.
                  System.Diagnostics.Trace.TraceWarning("GetHttpData2 failed for {0}: {1}", Url, e.Message);
                  return null;
              }

              // Read outside the try/finally and dispose everything, so that a failure
              // while reading does not leave the response open.
              using (response)
              using (Stream body = response.GetResponseStream())
              using (StreamReader reader = new StreamReader(body, Encoding.UTF8))
              {
                  return reader.ReadToEnd();
              }
          }
      }
  }

方法3

  /// <summary>
  /// Downloads a page with <see cref="WebClient"/> and decodes it to text.
  /// </summary>
  /// <remarks>
  /// The charset is taken from <paramref name="charSets"/> when exactly one non-empty
  /// name is supplied. Otherwise it is sniffed from the first
  /// <c>&lt;meta ... charset=...&gt;</c> tag in the page. When no charset is found, or
  /// the name is not a known encoding, UTF-8 is used.
  /// </remarks>
  /// <param name="url">Address of the page to download.</param>
  /// <param name="charSets">
  /// Optional: a single charset name for the page. Pass nothing, <c>null</c> or an
  /// empty string to detect the charset from the page itself.
  /// </param>
  /// <returns>The decoded page text, or an empty string when the download fails.</returns>
  public static string getHtml(string url, params string[] charSets)
  {
      try
      {
          // An explicit null argument arrives as a null array, so guard before reading Length.
          string charSet = (charSets != null && charSets.Length == 1) ? charSets[0] : null;

          byte[] pageBytes;
          using (WebClient client = new WebClient())
          {
              // Some pages need more than this (for example a Cookie header added via
              // client.Headers.Add("Cookie", ...)); extend here as required.
              // Authenticate with the credentials of the current security context.
              client.Credentials = CredentialCache.DefaultCredentials;
              pageBytes = client.DownloadData(url);
          }

          if (string.IsNullOrEmpty(charSet))
          {
              // The meta tag and charset name are ASCII, so an ASCII decode is enough for sniffing.
              string sniffText = Encoding.ASCII.GetString(pageBytes);
              // Accepts both <meta charset="x"> and <meta ... content="text/html; charset=x">,
              // and captures only the charset name (no quotes or trailing attributes).
              Match metaMatch = Regex.Match(
                  sniffText,
                  "<meta[^>]*?charset\\s*=\\s*[\"']?\\s*([\\w-]+)",
                  RegexOptions.IgnoreCase);
              charSet = metaMatch.Success ? metaMatch.Groups[1].Value : null;
          }

          return ResolveEncoding(charSet).GetString(pageBytes);
      }
      catch (Exception e)
      {
          // Callers rely on "" meaning "could not fetch"; record why instead of
          // discarding the exception silently.
          System.Diagnostics.Trace.TraceWarning("getHtml failed for {0}: {1}", url, e.Message);
          return "";
      }
  }

  /// <summary>
  /// Maps a charset name to an <see cref="Encoding"/>, falling back to UTF-8 when the
  /// name is empty or not recognised.
  /// </summary>
  private static Encoding ResolveEncoding(string charSetName)
  {
      if (string.IsNullOrEmpty(charSetName))
      {
          return Encoding.UTF8;
      }

      try
      {
          return Encoding.GetEncoding(charSetName.Trim());
      }
      catch (ArgumentException)
      {
          return Encoding.UTF8;
      }
      catch (NotSupportedException)
      {
          return Encoding.UTF8;
      }
  }

asp.net 获取网页源文件的方法

有时候我们需要获取网页的源文件,使用下面这个方法就可以轻松完成:

  /// <summary>
  /// Requests <paramref name="strUrl"/> and returns the response body decoded as gb2312.
  /// </summary>
  /// <param name="strUrl">Address of the page to download.</param>
  /// <returns>The response body as text.</returns>
  private string GetStringByUrl(string strUrl)
  {
      WebRequest request = WebRequest.Create(strUrl);

      // Dispose the response, its stream and the reader so the connection is released
      // even when reading throws.
      using (WebResponse response = request.GetResponse())
      using (Stream body = response.GetResponseStream())
      using (StreamReader reader = new StreamReader(body, Encoding.GetEncoding("gb2312")))
      {
          return reader.ReadToEnd();
      }
  }

只要传入要下载网页的地址就OK了!
通过这个方法做个源码导出:

  /// <summary>
  /// Renders "Default2.aspx" and sends the result to the client as a downloadable
  /// attachment named index.html.
  /// </summary>
  /// <returns>The rendered text that was written to the response.</returns>
  private string SaveHTML()
  {
      // NOTE(review): RenderPage is defined elsewhere; presumably it returns the page's rendered HTML - confirm.
      string str = RenderPage("Default2.aspx");

      // Write the body as UTF-8 so non-ASCII text is not garbled.
      System.Text.Encoding utf8 = System.Text.Encoding.GetEncoding("UTF-8");
      Response.ContentEncoding = utf8;
      Response.AddHeader("Content-Disposition", "attachment;filename=index.html");

      // Content-length counts bytes, not characters; the two differ for any
      // non-ASCII character once the text is encoded as UTF-8.
      Response.AddHeader(
          "Content-length",
          utf8.GetByteCount(str).ToString(System.Globalization.CultureInfo.InvariantCulture));

      Response.Write(str);
      Response.End();
      return str;
  }

以上就是asp.net 抓取网页源码的全部代码了,希望对大家有所帮助。

人气教程排行