将HDFS中的数据导入HBase

hadoop

2014-03-02

将HDFS中的数据导入HBase

package Hbase;

import java.text.SimpleDateFormat;
import java.util.Date;

import org.apache.Hadoop.conf.Configuration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class BatchImport {

public static void main(String[] args) throws Exception {
final Configuration configuration = new Configuration();
//设置zookeeper
configuration.set("hbase.zookeeper.quorum", "hadoop1");
//设置hbase表名称
configuration.set(TableOutputFormat.OUTPUT_TABLE, "wlan_log");
//将该值改大，防止hbase超时退出
configuration.set("dfs.socket.timeout", "180000");

final Job job = new Job(configuration, "HBaseBatchImport");

job.setMapperClass(BatchImportMapper.class);
job.setReducerClass(BatchImportReducer.class);
//设置map的输出，不设置reduce的输出类型
job.setMapOutputKeyClass(LongWritable.class);
job.setMapOutputValueClass(Text.class);

job.setInputFormatClass(TextInputFormat.class);
//不再设置输出路径，而是设置输出格式类型
job.setOutputFormatClass(TableOutputFormat.class);

FileInputFormat.setInputPaths(job, "hdfs://hadoop1:9000/HTTP*");

job.waitForCompletion(true);
}

static class BatchImportMapper extends Mapper<LongWritable, Text, LongWritable, Text>{
SimpleDateFormat dateformat1=new SimpleDateFormat("yyyyMMddHHmmss");
Text v2 = new Text();

protected void map(LongWritable key, Text value, Context context) throws java.io.IOException ,InterruptedException {
final String[] splited = value.toString().split("\t");
try {
final Date date = new Date(Long.parseLong(splited[0].trim()));
final String dateFormat = dateformat1.format(date);
String rowKey = splited[1]+":"+dateFormat;
v2.set(rowKey+"\t"+value.toString());
context.write(key, v2);
} catch (NumberFormatException e) {
final Counter counter = context.getCounter("BatchImport", "ErrorFormat");
counter.increment(1L);
System.out.println("出错了"+splited[0]+" "+e.getMessage());
}
};
}

static class BatchImportReducer extends TableReducer<LongWritable, Text, NullWritable>{
protected void reduce(LongWritable key, java.lang.Iterable<Text> values, Context context) throws java.io.IOException ,InterruptedException {
for (Text text : values) {
final String[] splited = text.toString().split("\t");

final Put put = new Put(Bytes.toBytes(splited[0]));
put.add(Bytes.toBytes("cf"), Bytes.toBytes("date"), Bytes.toBytes(splited[1]));
//省略其他字段，调用put.add(....)即可
context.write(NullWritable.get(), put);
}
};
}

}

HBase 的详细介绍：请点这里
HBase 的下载地址：请点这里

相关阅读：

hdfs hbase hadoop

安科网

将HDFS中的数据导入HBase

hadoop

hadoop

相关推荐

hdfs、hive、hbase的搭建总结

hbase设置ttl后出现坏块，重启后master abort 问题梳理

centos7安装hbase集群

hadoop伪分布式环境搭建

hadoop框架三大组件hdfs、mapreduce、yarn 内容

hadoop集群的启动与停止

hadoop创建目录

[AWS][大数据][Hadoop] 使用EMR做大数据分析

Hadoop

大数据期末复习重点

Hadoop之hadoop fs和hdfs dfs、hdfs fs三者区别

hadoop两个namenode都是standby问题

Hadoop之HDFS入门实战

hadoop 数据处理总结

Hadoop简介

hadoop hdfs csv导入hive表

HDFS分布式存储中NameNode 和DataNode 有什么区别？

2020年首个存储挖矿项目HDFS是什么？

Spark RDD

HDFS

hadoop