网页内容、图片及链接抓取通用类

最近在做一个分享功能：根据用户输入的网址，自动抓取该页面中的文本及图片内容。根据自己的需求和思路，

整理了一个通用的类，下次使用的时候可以直接调用，也分享在这里，希望能给用得着的朋友带来方便。

网页抓取类

using System;

using System.Collections.Generic;

using System.Linq;

using System.Web;

using System.Text;

using System.Net;

using System.IO;

using System.Text.RegularExpressions;

using System.Collections;

using System.IO.Compression;

/// <summary>
/// Web page scraping helpers: download a page, detect its charset, and extract
/// the title, meta description, links, body text and image URLs.
/// Author: loafinweb
/// Date: 2011-09-12
/// </summary>
public class webCrawl
{
    // Responses that declare a Content-Length of 1 MB or more are ignored.
    private const int MaxContentLength = 1024 * 1024;

    // Timeout, in milliseconds, applied to every HTTP request made by this class.
    private const int RequestTimeoutMs = 30000;

    // Finds "charset=xxx" in markup. The optional quote lets it match both
    // content="text/html; charset=utf-8" and the HTML5 form <meta charset="utf-8">.
    private static readonly Regex CharsetRegex = new Regex(
        @"charset\s*=\s*[""']?\s*(?<charset>[A-Za-z0-9][\w.:-]*)",
        RegexOptions.IgnoreCase);

    // Matches absolute http/https URLs. The character class after the first '/'
    // is the same set the original pattern accepted (it includes the space character).
    private static readonly Regex LinkRegex = new Regex(
        @"https?://([\w-]+\.)+[\w-]+(/[\w ./?%&=-]*)?",
        RegexOptions.IgnoreCase);

    // Matches an <img> tag and captures the value of its src attribute.
    private static readonly Regex ImgSrcRegex = new Regex(
        @"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>",
        RegexOptions.IgnoreCase | RegexOptions.Multiline);

    // Accepts only paths that end with a known image file extension.
    private static readonly Regex ImgExtensionRegex = new Regex(
        @"\w+\.(gif|jpe?g|bmp|png)$",
        RegexOptions.IgnoreCase);

    public webCrawl() { }

    /// <summary>
    /// Downloads the page at <paramref name="url"/> and returns its text.
    /// </summary>
    /// <param name="url">Absolute URL of the page to download.</param>
    /// <returns>
    /// The decoded response body, or an empty string when the request fails, the
    /// status is not 200 OK, or the declared content length is 1 MB or more.
    /// </returns>
    public static string getHtml(string url)
    {
        try
        {
            // NOTE: getEncoding issues its own HTTP request, so each call
            // to getHtml contacts the server twice.
            Encoding en = resolveEncoding(getEncoding(url));

            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Headers.Set("Pragma", "no-cache");
            request.Timeout = RequestTimeoutMs;

            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            {
                if (response.StatusCode != HttpStatusCode.OK || response.ContentLength >= MaxContentLength)
                {
                    return String.Empty;
                }

                using (Stream body = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(body, en))
                {
                    return reader.ReadToEnd();
                }
            }
        }
        catch (Exception)
        {
            // Deliberately best-effort: callers treat an empty string as "no content".
            return String.Empty;
        }
    }

    /// <summary>
    /// Determines the charset name of the page at <paramref name="url"/>.
    /// A charset declared in the markup wins; otherwise the charset reported by the
    /// HTTP response is used; otherwise the system default encoding name is returned.
    /// </summary>
    /// <param name="url">Absolute URL of the page to inspect.</param>
    /// <returns>A charset name such as "utf-8" or "gb2312".</returns>
    /// <remarks>
    /// Redirects are not followed. Exceptions raised while creating or sending the
    /// request are not caught here and propagate with their original type.
    /// </remarks>
    public static string getEncoding(string url)
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.Timeout = RequestTimeoutMs;
        request.AllowAutoRedirect = false;

        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            if (response.StatusCode == HttpStatusCode.OK && response.ContentLength < MaxContentLength)
            {
                string html;
                using (StreamReader reader = openProbeReader(response))
                {
                    html = reader.ReadToEnd();
                }

                Match declared = CharsetRegex.Match(html);
                if (declared.Success)
                {
                    return declared.Groups["charset"].Value;
                }

                if (!string.IsNullOrEmpty(response.CharacterSet))
                {
                    return response.CharacterSet;
                }
            }
        }

        return Encoding.Default.BodyName;
    }

    /// <summary>
    /// Downloads the page at <paramref name="url"/> and returns the text of its
    /// &lt;title&gt; element.
    /// </summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>
    /// The title with every non-word character removed (the <c>\W</c> pattern strips
    /// whitespace and punctuation alike), or an empty string when no title is found.
    /// </returns>
    public static string getTitle(string url)
    {
        string htmlStr = getHtml(url);
        Match titleMatch = Regex.Match(
            htmlStr,
            @"<title\b[^>]*>([^<]*)</title>",
            RegexOptions.IgnoreCase | RegexOptions.Multiline);

        return Regex.Replace(titleMatch.Groups[1].Value, @"\W", "");
    }

    /// <summary>
    /// Downloads the page at <paramref name="url"/> and returns the content of its
    /// description meta tag.
    /// </summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>
    /// The description with every non-word character removed, or an empty string
    /// when the page has no matching meta tag.
    /// </returns>
    public static string getDescription(string url)
    {
        string htmlStr = getHtml(url);

        // The captured value ends at the quote that closes the content attribute,
        // so attributes that follow it are not included in the description.
        Match descMatch = Regex.Match(
            htmlStr,
            @"<meta\s+name\s*=\s*[""']description[""']\s+content\s*=\s*(?<q>[""'])(?<desc>[^<]*?)\k<q>",
            RegexOptions.IgnoreCase | RegexOptions.Multiline);

        return Regex.Replace(descMatch.Groups["desc"].Value, @"\W", "");
    }

    /// <summary>
    /// Extracts the distinct absolute http/https links found in
    /// <paramref name="htmlStr"/>.
    /// </summary>
    /// <param name="htmlStr">HTML (or any text) to scan; may be null or empty.</param>
    /// <returns>
    /// The links in order of first appearance, without duplicates. Empty when the
    /// input is null or empty or contains no links.
    /// </returns>
    public static List<string> getLink(string htmlStr)
    {
        List<string> list = new List<string>();
        if (string.IsNullOrEmpty(htmlStr))
        {
            return list;
        }

        // The set gives O(1) duplicate checks; the list keeps first-seen order.
        HashSet<string> seen = new HashSet<string>();
        foreach (Match m in LinkRegex.Matches(htmlStr))
        {
            if (seen.Add(m.Value))
            {
                list.Add(m.Value);
            }
        }

        return list;
    }

    /// <summary>
    /// Downloads the page at <paramref name="url"/> and returns the content of its
    /// &lt;body&gt; element after it has been passed through <see cref="parseHtml"/>.
    /// </summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>The filtered body text, or an empty string when no body is found.</returns>
    public static string getBody(string url)
    {
        string htmlStr = getHtml(url);
        Regex regBody = new Regex(@"(?is)<body[^>]*>(?:(?!</?body\b).)*</body>");
        Match m = regBody.Match(htmlStr);

        return m.Success ? parseHtml(m.Value) : string.Empty;
    }

    /// <summary>
    /// Downloads the page at <paramref name="url"/> and returns every &lt;img&gt; tag
    /// found in it.
    /// </summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>The raw markup of each image tag, in document order.</returns>
    public static List<string> getImg(string url)
    {
        List<string> list = new List<string>();
        string htmlStr = getHtml(url);

        foreach (Match m in Regex.Matches(htmlStr, @"<img\b[^>]+>", RegexOptions.IgnoreCase))
        {
            list.Add(m.Value);
        }

        return list;
    }

    /// <summary>
    /// Downloads the page at <paramref name="url"/> and returns the URL of every
    /// image it references. Relative paths are converted to absolute URLs.
    /// </summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>
    /// Image URLs in document order. Only src values that end in gif, jpg, jpeg,
    /// bmp or png (case-insensitive) are included.
    /// </returns>
    public static List<string> getImgPath(string url)
    {
        List<string> list = new List<string>();
        string htmlStr = getHtml(url);

        foreach (Match m in ImgSrcRegex.Matches(htmlStr))
        {
            string imgPath = m.Groups["imgUrl"].Value.Trim();

            // Second filter: drop src values that do not end in an image extension.
            if (ImgExtensionRegex.IsMatch(imgPath))
            {
                list.Add(toAbsoluteImgUrl(url, imgPath));
            }
        }

        return list;
    }

    /// <summary>
    /// Downloads the image at <paramref name="fileurl"/> into the directory returned
    /// by <c>Server.MapPath("")</c> of the current HTTP context, under a
    /// timestamp-based file name.
    /// </summary>
    /// <param name="fileurl">
    /// Absolute URL of the image, for example http://xxx.com/img/logo.jpg.
    /// </param>
    /// <remarks>
    /// Nothing is downloaded when the URL is null, empty, not absolute, or its path
    /// has no file extension.
    /// </remarks>
    public void DownloadImg(string fileurl)
    {
        Uri source;
        if (string.IsNullOrEmpty(fileurl) || !Uri.TryCreate(fileurl, UriKind.Absolute, out source))
        {
            return;
        }

        // Read the extension from the path component only, so that dots in the
        // host name and any query string never end up in the local file name.
        string extension = Path.GetExtension(source.AbsolutePath);
        if (string.IsNullOrEmpty(extension))
        {
            return;
        }

        string imgName = DateTime.Now.ToString("yyyyMMddHHmmssffff") + extension;
        string filepath = System.Web.HttpContext.Current.Server.MapPath("") + "/" + imgName;

        using (WebClient client = new WebClient())
        {
            client.DownloadFile(fileurl, filepath);
        }
    }

    /// <summary>
    /// Removes HTML tags, any remaining angle brackets, and all whitespace from
    /// <paramref name="html"/>.
    /// </summary>
    /// <param name="html">Markup to filter; may be null.</param>
    /// <returns>The filtered text, or an empty string for null or empty input.</returns>
    public static string parseHtml(string html)
    {
        if (string.IsNullOrEmpty(html))
        {
            return string.Empty;
        }

        string value = Regex.Replace(html, "<[^>]*>", string.Empty);
        value = value.Replace("<", string.Empty);
        value = value.Replace(">", string.Empty);

        return Regex.Replace(value, @"\s+", "");
    }

    /// <summary>
    /// Returns the base of <paramref name="url"/>: everything up to and including
    /// the last '/' of its path, with any query string or fragment dropped.
    /// </summary>
    /// <param name="url">URL to reduce.</param>
    /// <returns>
    /// "http://www.xxx.com/" for "http://www.xxx.com" and for
    /// "http://www.xxx.com/art.aspx"; "http://www.xxx.com/news/" for
    /// "http://www.xxx.com/news/art.aspx".
    /// </returns>
    public static string getUrl(string url)
    {
        Uri parsed;
        if (Uri.TryCreate(url, UriKind.Absolute, out parsed)
            && (parsed.Scheme == Uri.UriSchemeHttp || parsed.Scheme == Uri.UriSchemeHttps))
        {
            // GetLeftPart(Path) drops the query and fragment and always contains at
            // least the root '/', so a bare host is not cut inside the "//".
            string path = parsed.GetLeftPart(UriPartial.Path);
            return path.Substring(0, path.LastIndexOf('/') + 1);
        }

        // Input that is not an absolute http(s) URL: plain string handling.
        int lastSlash = url.LastIndexOf('/');
        return lastSlash < 0 ? url + "/" : url.Substring(0, lastSlash) + "/";
    }

    // Maps a charset name to an Encoding, falling back to the system default
    // when the name is empty or not recognised.
    private static Encoding resolveEncoding(string charset)
    {
        if (string.IsNullOrEmpty(charset))
        {
            return Encoding.Default;
        }

        try
        {
            return Encoding.GetEncoding(charset);
        }
        catch (ArgumentException)
        {
            return Encoding.Default;
        }
    }

    // Opens a reader over the response body for charset sniffing, decompressing
    // the stream when the response reports gzip content encoding.
    private static StreamReader openProbeReader(HttpWebResponse response)
    {
        bool isGzip = response.ContentEncoding != null
            && response.ContentEncoding.Equals("gzip", StringComparison.InvariantCultureIgnoreCase);

        if (isGzip)
        {
            return new StreamReader(new GZipStream(response.GetResponseStream(), CompressionMode.Decompress));
        }

        return new StreamReader(response.GetResponseStream(), Encoding.ASCII);
    }

    // Turns an image path taken from a page into an absolute URL.
    private static string toAbsoluteImgUrl(string pageUrl, string imgPath)
    {
        if (imgPath.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
            || imgPath.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
        {
            return imgPath;
        }

        // Standard relative-reference resolution handles "/img/a.png",
        // "../a.png" and "//host/a.png", which plain concatenation gets wrong.
        Uri pageUri;
        Uri resolved;
        if (Uri.TryCreate(pageUrl, UriKind.Absolute, out pageUri)
            && Uri.TryCreate(pageUri, imgPath, out resolved))
        {
            return resolved.AbsoluteUri;
        }

        return getUrl(pageUrl) + imgPath;
    }
}

网页内容、图片及链接抓取通用类

网页内容、图片及链接抓取通用类