通过正则表达式提取网页
转帖地址:http://www.cnblogs.com/gaowenbin/articles/1777421.html
1,先看需求:我们需要设计一个文本框,输入网址后从该网址下载HTML源码。想一想,我们可以通过 WebClient(或 HttpWebRequest)来处理。
/// <summary>
/// Downloads the page at <paramref name="uri"/> and returns its HTML as text,
/// decoded with the best charset that can be determined.
/// </summary>
/// <remarks>
/// Charset resolution order: a <c>charset=</c> declaration found inside the
/// downloaded markup wins; otherwise the charset announced by the HTTP response
/// is used; otherwise UTF-8. The body is only read when the response media type
/// is <c>text/html</c>, and it is read from the same response (one request).
/// </remarks>
/// <param name="uri">Address of the page to download (HTTP or HTTPS).</param>
/// <returns>
/// The decoded HTML, or <see cref="string.Empty"/> when the address is not an
/// HTTP(S) address, the response is not <c>text/html</c>, or any error occurs.
/// </returns>
private string GetWebPage(string uri)
{
    try
    {
        // Non-HTTP schemes (file:, ftp:, ...) do not produce an HttpWebRequest.
        HttpWebRequest request = WebRequest.Create(uri) as HttpWebRequest;
        if (request == null)
        {
            return string.Empty;
        }

        byte[] rawHtml;
        Encoding headerEncoding;

        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            // Compare only the media type, ignoring parameters such as "; charset=...".
            string contentType = response.ContentType ?? string.Empty;
            string mediaType = contentType.Split(';')[0].Trim();
            if (!string.Equals(mediaType, "text/html", StringComparison.OrdinalIgnoreCase))
            {
                return string.Empty;
            }

            // The Content-Encoding header (gzip, deflate, ...) names a compression
            // scheme, not a charset, so it is deliberately not consulted here.
            headerEncoding = ResolveEncoding(ExtractCharsetName(contentType))
                ?? ResolveEncoding(response.CharacterSet)
                ?? Encoding.UTF8;

            using (System.IO.Stream body = response.GetResponseStream())
            {
                if (body == null)
                {
                    return string.Empty;
                }

                using (System.IO.MemoryStream buffer = new System.IO.MemoryStream())
                {
                    body.CopyTo(buffer);
                    rawHtml = buffer.ToArray();
                }
            }
        }

        // First pass: decode with the header/default charset so the markup can be
        // searched for its own charset declaration.
        string html = headerEncoding.GetString(rawHtml);

        // Second pass: when the page declares a different, usable charset,
        // decode the same bytes again with it.
        Encoding declaredEncoding = ResolveEncoding(ExtractCharsetName(html));
        if (declaredEncoding != null && !declaredEncoding.Equals(headerEncoding))
        {
            html = declaredEncoding.GetString(rawHtml);
        }

        return html;
    }
    catch (Exception)
    {
        // Best-effort contract relied on by callers: any failure (bad address,
        // network error, HTTP error status) yields an empty page, never an exception.
        return string.Empty;
    }
}

/// <summary>Matches <c>charset=value</c>, with or without a quote before the value.</summary>
private static readonly Regex CharsetPattern = new Regex(
    @"charset\b\s*=\s*[""']?\s*(?<charset>[A-Za-z0-9_.:\-]+)",
    RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);

/// <summary>
/// Returns the first charset name declared in <paramref name="text"/> (a
/// Content-Type header value or HTML markup), or an empty string when none is found.
/// </summary>
private static string ExtractCharsetName(string text)
{
    if (string.IsNullOrEmpty(text))
    {
        return string.Empty;
    }

    Match match = CharsetPattern.Match(text);
    return match.Success ? match.Groups["charset"].Value : string.Empty;
}

/// <summary>
/// Maps a charset name to an <see cref="Encoding"/>; returns <c>null</c> when the
/// name is empty or not recognised, so one bad name does not discard the page.
/// </summary>
private static Encoding ResolveEncoding(string charsetName)
{
    if (string.IsNullOrEmpty(charsetName))
    {
        return null;
    }

    try
    {
        return Encoding.GetEncoding(charsetName.Trim());
    }
    catch (ArgumentException)
    {
        // Unknown or unsupported charset name.
        return null;
    }
}
2,提取出HTML源码后,接下来需要对源码进行处理
// Post-processing of the downloaded markup: each statement reassigns Html to the
// result of a String.Replace call whose replacement text is the empty string.
// NOTE(review): every search string below is empty (one holds only a raw line
// break), which suggests the original literals were lost when this listing was
// extracted. String.Replace throws ArgumentException when the search string is
// empty, so these calls cannot run as written. Restore the intended search
// strings from the original article before using this block.
Html=Html.Replace("","");
Html=Html.Replace("","");
Html=Html.Replace("
","");
Html=Html.Replace("","");
Html=Html.Replace("","");
Html=Html.Replace("","");
Html=Html.Replace("","");
Html=Html.Replace("","");
Html=Html.Replace("","");
Html=Html.Replace("","");
Html=Html.Replace("","");
Html=Html.Replace("","");
3,由于我们需要在TreeView中完整地显示网页的数据,首先在页面上创建TreeView,然后创建节点
/// <summary>
/// Builds a tree node labelled <paramref name="Titles"/> and adds one child node
/// per match that the <c>regex</c> field finds in <paramref name="inputString"/>.
/// Each child is built recursively from the text captured by group 3 of its match.
/// </summary>
/// <param name="inputString">Text to search for matches.</param>
/// <param name="Titles">Label assigned to the returned node.</param>
/// <returns>
/// The populated node. If an exception occurs, its message is shown in a message
/// box and the node is returned with whatever had been added up to that point.
/// </returns>
private TreeNode populateTagNode(string inputString, string Titles)
{
    TreeNode htmlTagNode = new TreeNode();
    try
    {
        // `regex` and `intMathchesMade` are declared outside this method.
        MatchCollection matchesFound = regex.Matches(inputString);
        htmlTagNode.Text = Titles;

        foreach (Match matchMade in matchesFound)
        {
            // Running total of matches seen across all recursive calls.
            intMathchesMade = intMathchesMade + 1;

            // NOTE(review): the child label is an empty string, so every child node
            // has blank text. The original expression was probably lost in
            // extraction (possibly "<" + matchMade.Groups[1].Value + ">").
            // Confirm against the regex definition before changing it.
            string sTag = "";

            TreeNode htmlSubTagNode = populateTagNode(matchMade.Groups[3].Value, sTag);
            htmlTagNode.Nodes.Add(htmlSubTagNode);
        }
    }
    catch (Exception ex)
    {
        MessageBox.Show("Error:" + ex.Message);
    }

    return htmlTagNode;
}