最近在做一個分享功能,根據使用者輸入的網址,自動抓取連結中的文本及圖檔内容。根據自己的需求和思路,
整理了一個通用的類,下次使用的時候可以直接調用。也分享在這裡,希望能給用得着的朋友帶來方便。
網頁抓取類
using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Text;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
using System.Collections;
using System.IO.Compression;
/// <summary>
/// Name: web page scraping helper.
/// Author: loafinweb
/// Date: 2011-09-12
/// </summary>
/// <remarks>
/// Every URL-based method downloads the page on its own, and <see cref="getHtml"/> issues two
/// requests per call (one to detect the charset, one to read the body). Calling several of
/// these methods for the same URL therefore repeats the downloads.
/// </remarks>
public class webCrawl
{
    // Timeout, in milliseconds, applied to every HTTP request made by this class.
    private const int RequestTimeoutMs = 30000;

    // Responses that declare a Content-Length of this size or more are ignored.
    private const long MaxContentLength = 1024 * 1024;

    // Matches charset=utf-8, charset="utf-8" and charset='utf-8' and captures only the name.
    private static readonly Regex CharsetRegex = new Regex(
        @"charset\s*=\s*[""']?\s*(?<charset>[\w.:-]+)", RegexOptions.IgnoreCase);

    // Absolute http/https links.
    private static readonly Regex LinkRegex = new Regex(
        @"https?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?", RegexOptions.IgnoreCase);

    // Paths that end in a supported image extension.
    private static readonly Regex ImageExtensionRegex = new Regex(
        @"\w+\.(gif|jpe?g|bmp|png)$", RegexOptions.IgnoreCase);

    private static readonly char[] QueryOrFragment = new char[] { '?', '#' };

    public webCrawl() { }

    /// <summary>
    /// Downloads the page at <paramref name="url"/> and decodes it with the charset reported by
    /// <see cref="getEncoding"/>.
    /// </summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>
    /// The page text, or an empty string when the request fails, the status is not 200, the
    /// declared length is 1 MB or more, or the detected charset is unknown.
    /// </returns>
    public static string getHtml(string url)
    {
        try
        {
            Encoding encoding = Encoding.GetEncoding(getEncoding(url));
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Headers.Set("Pragma", "no-cache");
            request.Timeout = RequestTimeoutMs;

            // using guarantees the connection is released on every path, including
            // non-OK statuses and exceptions.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            {
                if (response.StatusCode != HttpStatusCode.OK || response.ContentLength >= MaxContentLength)
                {
                    return string.Empty;
                }

                Stream body = response.GetResponseStream();
                if (IsGzip(response))
                {
                    // Same gzip handling as getEncoding, so both see the same text.
                    body = new GZipStream(body, CompressionMode.Decompress);
                }

                using (StreamReader reader = new StreamReader(body, encoding))
                {
                    return reader.ReadToEnd();
                }
            }
        }
        catch (Exception)
        {
            // Deliberate best effort: callers treat an empty string as "page unavailable".
            return string.Empty;
        }
    }

    /// <summary>
    /// Detects the charset of the page at <paramref name="url"/>.
    /// </summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>
    /// The first <c>charset=</c> value found in the markup; otherwise the charset from the
    /// Content-Type header; otherwise <c>Encoding.Default.BodyName</c>. The default is also
    /// returned when the status is not 200 (redirects are not followed) or the declared length
    /// is 1 MB or more.
    /// </returns>
    /// <exception cref="Exception">
    /// Request failures (for example <see cref="WebException"/> or
    /// <see cref="UriFormatException"/>) propagate to the caller with their original type.
    /// </exception>
    public static string getEncoding(string url)
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.Timeout = RequestTimeoutMs;
        request.AllowAutoRedirect = false;

        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            if (response.StatusCode == HttpStatusCode.OK && response.ContentLength < MaxContentLength)
            {
                // Read the header value before the body stream is consumed and closed.
                string headerCharset = response.CharacterSet;

                string html;
                Stream body = response.GetResponseStream();
                StreamReader reader = IsGzip(response)
                    ? new StreamReader(new GZipStream(body, CompressionMode.Decompress))
                    : new StreamReader(body, Encoding.ASCII);
                using (reader)
                {
                    html = reader.ReadToEnd();
                }

                Match match = CharsetRegex.Match(html);
                if (match.Success)
                {
                    return match.Groups["charset"].Value;
                }

                if (!string.IsNullOrEmpty(headerCharset))
                {
                    return headerCharset;
                }
            }
        }

        return Encoding.Default.BodyName;
    }

    /// <summary>
    /// Returns the content of the page's <c>&lt;title&gt;</c> element.
    /// </summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>
    /// The title with every non-word character (whitespace and punctuation) removed, or an
    /// empty string when no title is found.
    /// </returns>
    public static string getTitle(string url)
    {
        string htmlStr = getHtml(url);
        Match titleMatch = Regex.Match(htmlStr, "<title>([^<]*)</title>", RegexOptions.IgnoreCase | RegexOptions.Multiline);

        // \W strips punctuation as well as whitespace; kept because callers may rely on it.
        return Regex.Replace(titleMatch.Groups[1].Value, @"\W", "");
    }

    /// <summary>
    /// Returns the content of the page's <c>&lt;meta name="Description"&gt;</c> element.
    /// </summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>
    /// The description with every non-word character removed, or an empty string when the
    /// element is not found.
    /// </returns>
    public static string getDescription(string url)
    {
        string htmlStr = getHtml(url);
        Match descMatch = Regex.Match(htmlStr, "<meta name=\"Description\" content=\"([^<]*)\"*>", RegexOptions.IgnoreCase | RegexOptions.Multiline);

        // The greedy capture can include the closing quote and slash; \W removes them.
        return Regex.Replace(descMatch.Groups[1].Value, @"\W", "");
    }

    /// <summary>
    /// Extracts the distinct absolute http/https links from a piece of HTML.
    /// </summary>
    /// <param name="htmlStr">HTML text to scan.</param>
    /// <returns>The links in order of first appearance, without duplicates.</returns>
    public static List<string> getLink(string htmlStr)
    {
        List<string> links = new List<string>();
        HashSet<string> seen = new HashSet<string>();

        foreach (Match match in LinkRegex.Matches(htmlStr))
        {
            // Add returns false for a link that was already collected.
            if (seen.Add(match.Value))
            {
                links.Add(match.Value);
            }
        }

        return links;
    }

    /// <summary>
    /// Returns the text inside the page's <c>&lt;body&gt;</c> element.
    /// </summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>
    /// The body passed through <see cref="parseHtml"/> (tags and all whitespace removed), or an
    /// empty string when no body element is found.
    /// </returns>
    public static string getBody(string url)
    {
        string htmlStr = getHtml(url);
        Match bodyMatch = Regex.Match(htmlStr, @"(?is)<body[^>]*>(?:(?!</?body\b).)*</body>");
        return bodyMatch.Success ? parseHtml(bodyMatch.Value) : string.Empty;
    }

    /// <summary>
    /// Returns every <c>&lt;img&gt;</c> tag of the page as raw markup.
    /// </summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>The matched tags in document order.</returns>
    public static List<string> getImg(string url)
    {
        List<string> tags = new List<string>();
        string htmlStr = getHtml(url);

        foreach (Match match in Regex.Matches(htmlStr, @"<(IMG|img)[^>]+>"))
        {
            tags.Add(match.Value);
        }

        return tags;
    }

    /// <summary>
    /// Returns the absolute URL of every image (gif, jpg, jpeg, bmp, png) referenced by the page.
    /// </summary>
    /// <param name="url">Absolute URL of the page; relative image paths are resolved against it.</param>
    /// <returns>Absolute image URLs in document order.</returns>
    public static List<string> getImgPath(string url)
    {
        List<string> list = new List<string>();
        string htmlStr = getHtml(url);
        string pat = @"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>";

        foreach (Match m in Regex.Matches(htmlStr, pat, RegexOptions.IgnoreCase | RegexOptions.Multiline))
        {
            string imgPath = m.Groups["imgUrl"].Value.Trim();

            // Second filter: keep only sources that point at an image file.
            if (ImageExtensionRegex.IsMatch(imgPath))
            {
                list.Add(ToAbsoluteUrl(url, imgPath));
            }
        }

        return list;
    }

    /// <summary>
    /// Downloads the file at <paramref name="fileurl"/> into the directory returned by
    /// <c>Server.MapPath("")</c>, under a timestamp-based name that keeps the URL's extension.
    /// </summary>
    /// <param name="fileurl">Absolute URL of the file, for example http://xxx.com/img/logo.jpg.</param>
    /// <remarks>
    /// Nothing is downloaded when the URL has no usable file extension. Requires a current
    /// <c>HttpContext</c>.
    /// </remarks>
    /// <exception cref="ArgumentNullException"><paramref name="fileurl"/> is null.</exception>
    public void DownloadImg(string fileurl)
    {
        if (fileurl == null)
        {
            throw new ArgumentNullException("fileurl");
        }

        string extension = GetUrlExtension(fileurl);
        if (extension.Length == 0)
        {
            return;
        }

        // NOTE(review): the extension comes from the caller-supplied URL and is not restricted
        // to image types, so a URL ending in e.g. ".aspx" is saved with that extension inside
        // the web directory. Consider whitelisting extensions if fileurl can be untrusted.
        string imgName = DateTime.Now.ToString("yyyyMMddHHmmssffff") + extension;
        string filepath = Path.Combine(System.Web.HttpContext.Current.Server.MapPath(""), imgName);

        using (WebClient client = new WebClient())
        {
            client.DownloadFile(fileurl, filepath);
        }
    }

    /// <summary>
    /// Strips markup from a piece of HTML.
    /// </summary>
    /// <param name="html">HTML text.</param>
    /// <returns>
    /// The text with tags, stray angle brackets and all whitespace removed (words are joined
    /// together).
    /// </returns>
    public static string parseHtml(string html)
    {
        string text = Regex.Replace(html, "<[^>]*>", string.Empty);
        text = text.Replace("<", string.Empty).Replace(">", string.Empty);
        return Regex.Replace(text, @"\s+", "");
    }

    /// <summary>
    /// Returns the directory part of a URL, always ending in a slash.
    /// </summary>
    /// <param name="url">URL to trim.</param>
    /// <returns>
    /// <c>http://www.xxx.com</c> gives <c>http://www.xxx.com/</c>;
    /// <c>http://www.xxx.com/art.aspx</c> gives <c>http://www.xxx.com/</c>;
    /// <c>http://www.xxx.com/dir/art.aspx</c> gives <c>http://www.xxx.com/dir/</c>.
    /// Any query string or fragment is dropped.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
    public static string getUrl(string url)
    {
        if (url == null)
        {
            throw new ArgumentNullException("url");
        }

        // Skip the "//" of the scheme so it is never mistaken for a path separator.
        int schemeEnd = url.IndexOf("://", StringComparison.Ordinal);
        int authorityStart = schemeEnd < 0 ? 0 : schemeEnd + 3;

        // Slashes inside the query string or fragment are not part of the path.
        int pathEnd = url.IndexOfAny(QueryOrFragment, authorityStart);
        if (pathEnd < 0)
        {
            pathEnd = url.Length;
        }

        int lastSlash = pathEnd > authorityStart
            ? url.LastIndexOf('/', pathEnd - 1, pathEnd - authorityStart)
            : -1;

        if (lastSlash < 0)
        {
            // Host only (no path): the site root is the directory.
            return url.Substring(0, pathEnd) + "/";
        }

        return url.Substring(0, lastSlash + 1);
    }

    // True when the response declares a gzip-compressed body.
    private static bool IsGzip(HttpWebResponse response)
    {
        return string.Equals(response.ContentEncoding, "gzip", StringComparison.OrdinalIgnoreCase);
    }

    // Resolves an image path found on pageUrl to an absolute URL; http/https URLs pass through unchanged.
    private static string ToAbsoluteUrl(string pageUrl, string path)
    {
        if (path.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
            || path.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
        {
            return path;
        }

        Uri baseUri;
        Uri resolved;
        if (Uri.TryCreate(pageUrl, UriKind.Absolute, out baseUri)
            && Uri.TryCreate(baseUri, path, out resolved))
        {
            // Handles "a.png", "../a.png", "/a.png" and "//host/a.png" correctly.
            return resolved.AbsoluteUri;
        }

        // The page URL could not be parsed: fall back to plain concatenation.
        return getUrl(pageUrl) + path;
    }

    // Returns the file extension (with leading dot) of the URL's last path segment, or "" when there is none.
    private static string GetUrlExtension(string fileurl)
    {
        int end = fileurl.IndexOfAny(QueryOrFragment);
        string path = end < 0 ? fileurl : fileurl.Substring(0, end);

        int lastSlash = path.LastIndexOf('/');
        int lastDot = path.LastIndexOf('.');
        if (lastDot < 0 || lastDot < lastSlash || lastDot == path.Length - 1)
        {
            return string.Empty;
        }

        string extension = path.Substring(lastDot);

        // Refuse anything that could not be part of a plain file name.
        if (extension.IndexOfAny(Path.GetInvalidFileNameChars()) >= 0)
        {
            return string.Empty;
        }

        return extension;
    }
}