通过正则表达式提取网页

转帖地址:http://www.cnblogs.com/gaowenbin/articles/1777421.html

1,先看需求:我们需要设计一个文本框,输入网址后从该网址下载HTML源码。想一想,可以先用HttpWebRequest读取响应头来判断内容类型和编码,再通过WebClient下载页面内容。

/// <summary>
/// Downloads the page at <paramref name="uri"/> and returns its HTML decoded
/// with the best character set that can be determined.
/// </summary>
/// <remarks>
/// The encoding is chosen in this order: the <c>charset</c> parameter of the
/// Content-Type header, then <see cref="HttpWebResponse.CharacterSet"/>, then
/// UTF-8. After a first decode, a <c>charset=...</c> declaration found in the
/// markup itself overrides the header when it names a different code page.
/// </remarks>
/// <param name="uri">Absolute HTTP(S) address of the page to download.</param>
/// <returns>
/// The decoded HTML, or <see cref="string.Empty"/> when the response is not
/// <c>text/html</c> or when any step of the download fails.
/// </returns>
private string GetWebPage(string uri)
{
    try
    {
        Encoding pageEncoding = Encoding.UTF8; // fallback when nothing better is declared

        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(uri);
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            string contentType = response.ContentType ?? string.Empty;

            // Only the media type (the part before any ';' parameter) decides
            // whether the body is HTML; compare case-insensitively.
            string mediaType = contentType.Split(';')[0].Trim();
            if (!string.Equals(mediaType, "text/html", StringComparison.OrdinalIgnoreCase))
            {
                return string.Empty;
            }

            // Pull the charset parameter out of e.g. "text/html; charset=gb2312".
            string headerCharset = string.Empty;
            string compactType = contentType.Replace(" ", string.Empty);
            int charsetAt = compactType.IndexOf("charset=", StringComparison.OrdinalIgnoreCase);
            if (charsetAt > -1)
            {
                headerCharset = compactType.Substring(charsetAt + "charset=".Length);

                // Drop any parameter that follows the charset, and optional quotes.
                int parameterEnd = headerCharset.IndexOf(';');
                if (parameterEnd > -1)
                {
                    headerCharset = headerCharset.Substring(0, parameterEnd);
                }

                headerCharset = headerCharset.Trim('"', '\'');
            }

            // Content-Encoding (gzip, deflate, ...) names a compression scheme,
            // not a character set, so it is deliberately not consulted here.
            Encoding headerEncoding = TryGetEncoding(headerCharset) ?? TryGetEncoding(response.CharacterSet);
            if (headerEncoding != null)
            {
                pageEncoding = headerEncoding;
            }

            // The body is downloaded separately below, so cancel this request
            // instead of reading its response stream.
            request.Abort();
        }

        byte[] rawHtml;
        using (WebClient client = new WebClient())
        {
            rawHtml = client.DownloadData(uri);
        }

        string html = pageEncoding.GetString(rawHtml);

        // A charset declared inside the markup wins over the header when it
        // names a different code page; re-decode the same bytes with it.
        Match declaredCharset = MarkupCharsetPattern.Match(html);
        if (declaredCharset.Success)
        {
            Encoding markupEncoding = TryGetEncoding(declaredCharset.Groups["charset"].Value);
            if (markupEncoding != null && markupEncoding.CodePage != pageEncoding.CodePage)
            {
                html = markupEncoding.GetString(rawHtml);
            }
        }

        return html;
    }
    catch (Exception)
    {
        // Best-effort contract kept from the original: any failure (bad URI,
        // network error, non-HTTP scheme) yields an empty result.
        return string.Empty;
    }
}

/// <summary>
/// Matches a <c>charset=value</c> declaration in markup, with or without
/// quotes around the value; the value is captured in the group "charset".
/// </summary>
private static readonly Regex MarkupCharsetPattern =
    new Regex(@"charset\b\s*=\s*[""']?(?<charset>[^""'\s;/>]+)", RegexOptions.IgnoreCase);

/// <summary>
/// Resolves an encoding name without throwing.
/// </summary>
/// <param name="name">Encoding name such as "utf-8"; may be null or empty.</param>
/// <returns>The matching encoding, or null when the name is empty or unknown.</returns>
private static Encoding TryGetEncoding(string name)
{
    if (string.IsNullOrEmpty(name))
    {
        return null;
    }

    try
    {
        return Encoding.GetEncoding(name.Trim());
    }
    catch (ArgumentException)
    {
        // Unknown or unsupported name: let the caller fall back.
        return null;
    }
}

2,当提取出HTML源码后,接下来需要对源码进行处理(去掉不需要的内容)

// Post-processing of the downloaded markup held in Html: a sequence of
// Replace calls, each substituting its first argument with an empty string.
// NOTE(review): every search string below is the empty literal "", and
// String.Replace throws ArgumentException when oldValue is empty, so these
// calls cannot run as written. The intended search strings were presumably
// lost when this listing was extracted - restore them from the original
// article before using this code.
// NOTE(review): the literal in the third call spans a line break, which a
// regular (non-verbatim) C# string literal does not allow.
Html=Html.Replace("","");

Html=Html.Replace("","");

Html=Html.Replace("

","");

Html=Html.Replace("","");

Html=Html.Replace("","");

Html=Html.Replace("","");

Html=Html.Replace("","");

Html=Html.Replace("","");

Html=Html.Replace("","");

Html=Html.Replace("","");

Html=Html.Replace("","");

Html=Html.Replace("","");

3,由于我们需要在TreeView中完整的将网页的数据显示出来,首先在页面上创建TreeView,然后创建节点

/// <summary>
/// Builds a tree node labelled <paramref name="Titles"/> with one child per
/// match of the <c>regex</c> member in <paramref name="inputString"/>. Each
/// child is built recursively from the third capture group of its match.
/// </summary>
/// <param name="inputString">Text to run the <c>regex</c> member against.</param>
/// <param name="Titles">Text shown on the returned node.</param>
/// <returns>
/// The populated node. When an exception occurs it is reported in a message
/// box and the node is returned as far as it was built.
/// </returns>
private TreeNode populateTagNode(string inputString, string Titles)
{
    TreeNode htmlTagNode = new TreeNode();

    try
    {
        // regex and intMathchesMade are members declared outside this method.
        MatchCollection matchesFound = regex.Matches(inputString);

        htmlTagNode.Text = Titles;

        foreach (Match matchMade in matchesFound)
        {
            // Running count of matches across all recursion levels.
            intMathchesMade = intMathchesMade + 1;

            // NOTE(review): the child label is always empty as written; the
            // expression that built it presumably was lost from this listing
            // - confirm against the original article.
            string sTag = "";

            // Recurse into the third capture group of the match.
            TreeNode htmlSubTagNode = populateTagNode(matchMade.Groups[3].Value, sTag);
            htmlTagNode.Nodes.Add(htmlSubTagNode);
        }
    }
    catch (Exception ex)
    {
        MessageBox.Show("Error:" + ex.Message);
    }

    return htmlTagNode;
}

相关推荐