抓取口碑网店铺资料
呵呵,只为自己玩,哈哈。
技术难度:
1)快速高效的抓取记录,并去重,和以后的增量抓取。
2)因为口碑网的联系方式是图片的,如何批量的完成OCR的转换
本文只是一个实验,不建议使用在项目当中,如下是部分代码。
涉及的开源代码:
AspriseOCR
资源包:把 crawler4j 及其依赖的所有 jar 包放在你的应用目录中,依赖清单如下。
- Berkeley DB Java Edition 4.0.71 or higher
- fastutil 5.1.5
- DSI Utilities 1.0.10 or higher
- Apache HttpClient 4.0.1
- Apache Log4j 1.2.15
- Apache Commons Logging 1.1.1
- Apache Commons Codec 1.4
如上的JAR包都必须包括在你的项目lib当中,否则会出错。
第一步,抓取口碑网店铺信息。 如下是抓取的部分代码:
主程序入口:Controller.java
package com.aragon.crawdata;
import org.junit.Test;
import edu.uci.ics.crawler4j.crawler.CrawlController;
/**
 * Entry point: configures crawler4j and starts the crawl of koubei.com.
 */
public class Controller {
    /** Seed URL the crawl starts from. */
    private static final String CRAWSITE = "http://beijing.koubei.com/";

    /**
     * Starts a crawl rooted at {@link #CRAWSITE} with 3 crawler threads,
     * storing crawl state under {@code /data/crawl/root}.
     *
     * @throws Exception if the crawl storage cannot be opened or the crawl
     *         fails. Propagating the exception (instead of swallowing it with
     *         printStackTrace) lets JUnit report the failure instead of
     *         silently passing.
     */
    @Test
    public void test() throws Exception {
        CrawlController controller = new CrawlController("/data/crawl/root");
        controller.addSeed(CRAWSITE);
        controller.start(MyCrawler.class, 3);
    }
}
抓取主程序 MyCrawler.java
package com.aragon.crawdata;
import java.util.ArrayList;
import java.util.regex.Pattern;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.url.WebURL;
/**
 * Crawler callback: restricts the crawl to koubei.com store pages, parses each
 * visited page with {@link Template4Koubei} and inserts the result into the
 * {@code info} table.
 */
public class MyCrawler extends WebCrawler {
    /** Only URLs under this prefix are followed. */
    private static final String CRAWSITE = "http://beijing.koubei.com/store/";
    /**
     * URLs matching this pattern are skipped.
     * NOTE(review): ".(detail--storeId-*)$" reads like a glob pasted as a
     * regex — confirm it matches the URLs it was meant to exclude.
     */
    private static final Pattern FILTERS = Pattern.compile(".(detail--storeId-*)$");
    private final SaveInDb savein = new SaveInDb();

    public MyCrawler() {
    }

    /**
     * Decides whether the crawler should follow {@code url}: reject anything
     * the filter pattern matches, then accept only URLs under {@link #CRAWSITE}.
     */
    public boolean shouldVisit(WebURL url) {
        String href = url.getURL().toLowerCase();
        if (FILTERS.matcher(href).matches()) {
            return false;
        }
        return href.startsWith(CRAWSITE);
    }

    /**
     * Called for every downloaded page: extracts the store data and, if a
     * store was recognised, inserts one row into the {@code info} table.
     */
    public void visit(Page page) {
        String url = page.getWebURL().getURL();
        System.out.println("Url:" + url);
        Template4Koubei koubei = new Template4Koubei();
        try {
            CrawDataModel model = koubei.translationData(page.getHTML());
            if (model != null) {
                // Values come from crawled (untrusted) HTML, so at minimum
                // escape single quotes before embedding them in the SQL.
                // TODO(review): change SaveInDb to use a PreparedStatement
                // with ? placeholders — string-built SQL stays injection-prone.
                savein.update("insert into info(type,name,address,tel1,tel2,tel3,otherinfo,othertype,ownername) value ('"
                        + esc(model.getType()) + "','"
                        + esc(model.getName()) + "','"
                        + esc(model.getAddress()) + "','"
                        + esc(model.getTel1()) + "','"
                        + esc(model.getTel2()) + "','"
                        + esc(model.getTel3()) + "','"
                        + esc(model.getOtherInfo()) + "','"
                        + esc(model.getOtherType()) + "','"
                        + esc(model.getOwnerName()) + "')");
            }
        } catch (Exception e) {
            // Best-effort: one unparseable page must not kill the whole crawl.
            e.printStackTrace();
        }
    }

    /** Doubles single quotes for use inside a quoted SQL literal; null-safe. */
    private static String esc(String s) {
        return s == null ? "" : s.replace("'", "''");
    }
}
真正解析口碑网店铺的类Template4Koubei.java
package com.aragon.crawdata;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.lang.StringUtils;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.Div;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.Span;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.util.NodeList;
/**
 * Screen-scraping template for koubei.com store detail pages: extracts the
 * store name, category, address, phone-image URLs and intro text out of the
 * page HTML by CSS class name and fills a {@link CrawDataModel}.
 */
public class Template4Koubei {
    // CSS "class" attribute values identifying each field on the store page.
    // NOTE(review): TYPE_TAG, TEL_TAG, MOB_TAG and OTHERINFO_TAG are all the
    // same class ("detail-info-item"); the page marks several fields with it.
    private static final String NAME_TAG = "title-wrap yk-fix-float"; // store name
    private static final String TYPE_TAG = "detail-info-item"; // category
    private static final String ADDRESS_TAG = "info yk-fix-float"; // address
    private static final String TEL_TAG = "detail-info-item"; // phone (rendered as image)
    private static final String MOB_TAG = "detail-info-item"; // unused, kept for reference
    private static final String DETAIL_TAG = "detail-intro"; // free-text intro
    private static final String OTHERINFO_TAG = "detail-info-item";

    /**
     * Parses one store page.
     *
     * @param webHtml raw HTML of the store detail page
     * @return the populated model, or {@code null} when no store name is found
     *         (i.e. the page is not a store detail page)
     * @throws Exception on HTML parser failure
     */
    public CrawDataModel translationData(String webHtml) throws Exception {
        CrawDataModel model = new CrawDataModel();
        Object[] name = findCrawDataByCol(webHtml, NAME_TAG);
        if (name == null || name[0] == null) {
            return null; // no store name -> not a store page
        }
        model.setName(StringUtils.deleteWhitespace(name[0].toString()));
        Object[] type = findCrawDataByCol(webHtml, TYPE_TAG);
        if (type != null) {
            model.setType(StringUtils.deleteWhitespace(type[0].toString()));
        }
        // Phone numbers are rendered as images; keep up to three image URLs.
        Object[] tel = findCrawDataByImageFilter(webHtml, TEL_TAG);
        if (tel != null) {
            if (tel.length > 0) {
                model.setTel1(StringUtils.deleteWhitespace(tel[0].toString()));
            }
            if (tel.length > 1) {
                model.setTel2(StringUtils.deleteWhitespace(tel[1].toString()));
            }
            if (tel.length > 2) {
                model.setTel3(StringUtils.deleteWhitespace(tel[2].toString()));
            }
        }
        // Concatenate at most the first three "other info" items.
        Object[] otherType = findCrawDataByCol(webHtml, OTHERINFO_TAG);
        if (otherType != null) {
            StringBuilder otherTypeString = new StringBuilder();
            for (int i = 0; i < otherType.length && i < 3; i++) {
                otherTypeString.append(StringUtils.deleteWhitespace(otherType[i].toString()));
            }
            model.setOtherType(otherTypeString.toString());
        }
        Object[] address = findCrawDataByCol(webHtml, ADDRESS_TAG);
        if (address != null) {
            model.setAddress(StringUtils.deleteWhitespace(address[0].toString()));
        }
        Object[] detail = findCrawDataByCol(webHtml, DETAIL_TAG);
        if (detail != null) {
            model.setOtherInfo(StringUtils.deleteWhitespace(detail[0].toString()));
        }
        // The scraped address sometimes runs into the "电话" (phone) label;
        // split everything from that label off into otherType. indexOf(..) > 0
        // intentionally ignores a label at position 0 (original behaviour).
        if (model.getAddress() != null) {
            int phoneLabel = model.getAddress().indexOf("电话");
            if (phoneLabel > 0) {
                model.setOtherType(model.getAddress().substring(phoneLabel));
                model.setAddress(model.getAddress().substring(0, phoneLabel));
            }
        }
        return model;
    }

    /**
     * Returns the plain text of every SPAN/TD/DIV node whose {@code class}
     * attribute equals {@code tag}, or {@code null} when nothing matches.
     * Bug fixed: the text buffer is now reset per node, so a node of an
     * unhandled type no longer re-adds the previous node's text.
     *
     * @param webHtml raw page HTML
     * @param tag     exact value of the class attribute to match
     * @throws Exception on HTML parser failure
     */
    public Object[] findCrawDataByCol(String webHtml, String tag) throws Exception {
        Parser parser = new Parser();
        parser.setInputHTML(webHtml);
        // Attribute filter: select nodes by their "class" attribute value.
        NodeFilter filter = new HasAttributeFilter("class", tag);
        NodeList nodeList = parser.extractAllNodesThatMatch(filter);
        if (nodeList == null || nodeList.size() == 0) {
            return null;
        }
        List<String> resultTextList = new ArrayList<String>();
        for (Node node : nodeList.toNodeArray()) {
            String line = ""; // reset per node (was carried over between nodes)
            if (node instanceof Span) {
                line = ((Span) node).toPlainTextString();
            } else if (node instanceof TableColumn) {
                line = ((TableColumn) node).toPlainTextString();
            } else if (node instanceof Div) {
                line = ((Div) node).toPlainTextString();
            }
            if (!StringUtil.isTrimEmpty(line)) {
                resultTextList.add(line);
            }
        }
        return resultTextList.toArray();
    }

    /**
     * Returns the image URL of every IMG tag whose parent tag's text contains
     * {@code tag} (used to pick out the phone-number images), or {@code null}
     * when the page has no images. Same stale-buffer fix as
     * {@link #findCrawDataByCol(String, String)}.
     *
     * @param webHtml raw page HTML
     * @param tag     substring to look for in the parent tag's text
     * @throws Exception on HTML parser failure
     */
    public Object[] findCrawDataByImageFilter(String webHtml, String tag) throws Exception {
        Parser parser = new Parser();
        parser.setInputHTML(webHtml);
        NodeFilter filter = new NodeClassFilter(ImageTag.class);
        NodeList nodeList = parser.extractAllNodesThatMatch(filter);
        if (nodeList == null || nodeList.size() == 0) {
            return null;
        }
        List<String> resultTextList = new ArrayList<String>();
        for (Node node : nodeList.toNodeArray()) {
            String line = ""; // reset per node
            // indexOf(tag) > 0 mirrors the original: a match at position 0 is ignored.
            if (node.getParent().getText().indexOf(tag) > 0) {
                // Cast is safe: the NodeClassFilter only yields ImageTag nodes.
                line = ((ImageTag) node).getImageURL();
            }
            if (!StringUtil.isTrimEmpty(line)) {
                resultTextList.add(line);
            }
        }
        return resultTextList.toArray();
    }
}
Model对象:CrawDataModel.java
package com.aragon.crawdata;
import java.io.Serializable;
/**
 * Value object holding one crawled store record (one row of the {@code info}
 * table). Plain mutable bean: no-arg constructor plus getters/setters.
 */
public class CrawDataModel implements Serializable {
    // Explicit serialVersionUID: Serializable classes without one break
    // deserialization whenever the class is recompiled with changes.
    private static final long serialVersionUID = 1L;

    private Integer id;       // database primary key (null until persisted)
    private String type;      // store category
    private String name;      // store name
    private String address;   // store address
    private String tel1;      // up to three phone entries
    private String tel2;
    private String tel3;
    private String ownerName; // store owner
    private String otherInfo; // free-text intro scraped from the page
    private String otherType; // extra "detail-info-item" text

    public Integer getId() {
        return id;
    }
    public void setId(Integer id) {
        this.id = id;
    }
    public String getType() {
        return type;
    }
    public void setType(String type) {
        this.type = type;
    }
    public String getName() {
        return name;
    }
    public void setName(String name) {
        this.name = name;
    }
    public String getAddress() {
        return address;
    }
    public void setAddress(String address) {
        this.address = address;
    }
    public String getTel1() {
        return tel1;
    }
    public void setTel1(String tel1) {
        this.tel1 = tel1;
    }
    public String getTel2() {
        return tel2;
    }
    public void setTel2(String tel2) {
        this.tel2 = tel2;
    }
    public String getTel3() {
        return tel3;
    }
    public void setTel3(String tel3) {
        this.tel3 = tel3;
    }
    public String getOwnerName() {
        return ownerName;
    }
    public void setOwnerName(String ownerName) {
        this.ownerName = ownerName;
    }
    public String getOtherInfo() {
        return otherInfo;
    }
    public void setOtherInfo(String otherInfo) {
        this.otherInfo = otherInfo;
    }
    public String getOtherType() {
        return otherType;
    }
    public void setOtherType(String otherType) {
        this.otherType = otherType;
    }
}
哈哈,如上的步骤就可以把口碑网的店铺信息抓入你的数据库当中,我在恶劣的2M宽带条件下,一个小时抓了2000多条信息,当然了,没有加入去重的功能。
第二步:把电话,手机信息转换成指定的数字号码
从口碑网抓取的电话,手机信息都是图片信息,如何把它转换成数字号码,我们使用AspriseOCR。网上破解的方法很多。大家可以google一下。
如下是他的TEST实验。因为我的应用代码太多,所以就粘一个Test,从技术上没有问题。
package com.aragon.crawdata;
import java.io.File;
/**
 * Given the URL of a phone-number image (koubei.com renders phone numbers as
 * images), downloads it, OCRs it with AspriseOCR and returns the recognised
 * text as a String.
 */
public class RecognisePicture {
    /**
     * Demo entry point: recognises one sample phone-number image.
     * create date:2009-5-22 author:Administrator
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // The original called .replaceAll("&", "&") on this URL — a no-op,
        // presumably a leftover of an HTML unescape ("&amp;" -> "&") that this
        // literal no longer needs; the dead call has been removed.
        recognise("http://count.koubei.com/showphone/showphone.php?f=jpg&w=96&h=10&bc=255,255,255&fc=0,0,0&fs=10&fn=arial&phone=MTI5NDU5ODg0Mw%3D%3D%23OQ532L8m6okoAzY6");
    }

    /**
     * Downloads the image at {@code fromUrl} to local disk, OCRs the file,
     * deletes the temporary file and returns the recognised string.
     * create date:2009-5-22 author:Administrator
     *
     * @param fromUrl absolute URL of the phone-number image
     * @return recognised text, or "" when the download failed
     */
    public static String recognise(String fromUrl) {
        String result = "";
        // Download the image file to local disk.
        File file = DownLoadWithUrl.downLoadFile(fromUrl);
        if (file != null) {
            // OCR the downloaded file.
            result = ParseJPEG_withOCR.getRecogniseStr(file);
            // Delete the file downloaded to local disk.
            DownLoadWithUrl.delFile(file);
        }
        System.out.println("输出的电话号码是:" + result);
        return result;
    }
}
如下是实验结果:
输出的电话号码是:01O 51402688
哈哈,这样一来。是不是能减少很多MM的工作量呀。enjoy it!