MapReduce Development and Deployment

The previous few posts gave me a rough picture of Hadoop's new YARN framework, so today it's back to my old trade: coding.

Since deployment happens on a Linux system, I started by installing the lrzsz utilities (rz/sz) for transferring files through the terminal.

Download and extract lrzsz-0.12.20.tar.gz.

Before installing, check that gcc is available on the system; if not, install it with yum install gcc.

Build and install lrzsz: ./configure && make && make install
This installs lrz and lsz into /usr/local/bin/ by default; now create symlinks named rz and sz:
# cd /usr/bin
# ln -s /usr/local/bin/lrz rz
# ln -s /usr/local/bin/lsz sz

Now for the code. First, put the required JARs on the classpath:
 commons-beanutils-1.7.0.jar
 commons-beanutils-core-1.8.0.jar
 commons-cli-1.2.jar
 commons-codec-1.4.jar
 commons-collections-3.2.1.jar
 commons-compress-1.4.1.jar
 commons-configuration-1.6.jar
 commons-digester-1.8.jar
 commons-el-1.0.jar
 commons-httpclient-3.1.jar
 commons-io-2.4.jar
 commons-lang-2.6.jar
 commons-logging-1.0.4.jar
 commons-logging.jar
 guava-11.0.2.jar
 hadoop-common-2.5.2.jar
 hadoop-mapreduce-client-core-2.5.2.jar
 log4j-1.2.14.jar
 mockito-all-1.8.5.jar
 mrunit-1.1.0-hadoop2.jar
 powermock-mockito-1.4.9-full.jar

For this walkthrough we'll write a job that computes the maximum temperature for each year. The weather data looks like this:

1901 01 01 06   -38 -9999 10200   270   159     8 -9999 -9999

where 1901 is the year, 01 01 are the month and day, and -38 is the temperature (-9999 marks a missing reading).
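
Because the fields are separated by a variable number of spaces, the simplest way to index them is to split on a whitespace regex. Here is a quick standalone sketch of the layout; only the year and temperature columns matter for this job, and the meaning of the other columns is taken on faith from the sample line:

package com.snwz.mapreduce;

// Standalone check of the record layout; not part of the job itself.
public class RecordFormatCheck {
	public static void main(String[] args) {
		String line = "1901 01 01 06   -38 -9999 10200   270   159     8 -9999 -9999";
		// split("\\s+") treats each run of spaces as a single delimiter
		String[] fields = line.trim().split("\\s+");
		System.out.println("field count: " + fields.length); // 12
		System.out.println("year:        " + fields[0]);     // 1901
		System.out.println("temperature: " + fields[4]);     // -38 (-9999 would mean missing)
	}
}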

Let's start with the Mapper:

package com.snwz.mapreduce;

import java.io.IOException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class MyMapper {

	private static final Log logger = LogFactory.getLog(MyMapper.class);

	public static class myMapper extends Mapper<Object, Text, IntWritable, IntWritable> {

		private final IntWritable year = new IntWritable();
		private final IntWritable record = new IntWritable();

		/**
		 * key     byte offset of the line within the input
		 * value   the line itself
		 * context job context object
		 *
		 * Note: since we want the maximum temperature per year, the year
		 * becomes the output key and the temperature the output value,
		 * both handled as integers.
		 */
		@Override
		protected void map(Object key, Text value, Context context)
				throws IOException, InterruptedException {
			String line = value.toString();
			if (line.trim().isEmpty()) {
				return;
			}
			// fields are separated by a variable number of spaces
			String[] array = line.trim().split("\\s+");
			if (array.length < 12) {
				logger.info("line : " + key + " array length error " + line);
				return;
			}
			// -9999 marks a missing temperature reading
			if ("-9999".equals(array[4])) {
				logger.info("line : " + key + " temperature error -9999");
				return;
			}
			year.set(Integer.parseInt(array[0]));
			record.set(Integer.parseInt(array[4]));
			context.write(year, record);
		}
	}
}

The Mapper emits one (year, temperature) pair per valid line; the framework then groups values by year, so the Reducer receives each year together with all of its temperatures and simply keeps the maximum:

package com.snwz.mapreduce;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;

public class MyReducer {

	public static class myReducer extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {
		@Override
		protected void reduce(IntWritable key, Iterable<IntWritable> values,
				Context context) throws IOException, InterruptedException {
			// maximum temperature for the year; start from MIN_VALUE so that
			// years whose readings are all negative are handled correctly
			int maxTem = Integer.MIN_VALUE;
			for (IntWritable i : values) {
				maxTem = Math.max(maxTem, i.get());
			}
			context.write(key, new IntWritable(maxTem));
		}
	}

}

With both classes done, we can verify them with MRUnit, a handy unit-testing library for MapReduce:

package com.snwz.mapreduce;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.MapDriver;
import org.apache.hadoop.mrunit.mapreduce.ReduceDriver;
import org.junit.Before;
import org.junit.Test;

import com.snwz.mapreduce.MyMapper.myMapper;
import com.snwz.mapreduce.MyReducer.myReducer;

public class MpTest {

	MapDriver<Object, Text, IntWritable, IntWritable> mapDriver;
	ReduceDriver<IntWritable, IntWritable, IntWritable, IntWritable> reduceDriver;

	@Before
	public void setUp() {
		System.setProperty("hadoop.home.dir", "E:\\hadoop\\hadoop-2.5.2");
		myMapper mapper = new myMapper();
		myReducer reducer = new myReducer();
		mapDriver = MapDriver.newMapDriver(mapper);
		reduceDriver = ReduceDriver.newReduceDriver(reducer);
	}

	@Test
	public void testMapper() throws IOException {
		mapDriver.withInput(new LongWritable(),
				new Text("1901 01 01 06   -78 -9999 10200   270   159     8 -9999 -9999"));
		mapDriver.withOutput(new IntWritable(1901), new IntWritable(-78));
		mapDriver.runTest();
	}

	@Test
	public void testReducer() throws IOException {
		List<IntWritable> values = new ArrayList<IntWritable>();
		values.add(new IntWritable(1));
		values.add(new IntWritable(2));
		values.add(new IntWritable(-48));
		values.add(new IntWritable(-12));
		reduceDriver.withInput(new IntWritable(1940), values)
				.withOutput(new IntWritable(1940), new IntWritable(2))
				.runTest();
	}
}
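
MRUnit can also exercise the map and reduce phases together through its MapReduceDriver. The suite above stops at the two single-phase tests, but an end-to-end sketch could look like the following; the class name MpPipelineTest is mine, and the second input line is invented to mirror the sample format:

package com.snwz.mapreduce;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.MapReduceDriver;
import org.junit.Test;

import com.snwz.mapreduce.MyMapper.myMapper;
import com.snwz.mapreduce.MyReducer.myReducer;

public class MpPipelineTest {

	// End-to-end test: raw lines go through map, shuffle, and reduce in one pass.
	@Test
	public void testMapReduce() throws IOException {
		MapReduceDriver<Object, Text, IntWritable, IntWritable, IntWritable, IntWritable> driver =
				MapReduceDriver.newMapReduceDriver(new myMapper(), new myReducer());
		driver.withInput(new LongWritable(1),
					new Text("1901 01 01 06   -38 -9999 10200   270   159     8 -9999 -9999"))
			.withInput(new LongWritable(2),
					new Text("1901 01 02 06   -21 -9999 10200   270   159     8 -9999 -9999"))
			.withOutput(new IntWritable(1901), new IntWritable(-21))
			.runTest();
	}
}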

Once the tests pass, we write the job driver:

package com.snwz.mapreduce;

import java.util.Date;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import com.snwz.mapreduce.MyMapper.myMapper;
import com.snwz.mapreduce.MyReducer.myReducer;

public class MyJob extends Configured implements Tool {

	private static final Log logger = LogFactory.getLog(MyJob.class);

	public static void main(String[] args) {
		try {
			int res = ToolRunner.run(new Configuration(), new MyJob(), args);
			System.exit(res);
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	public int run(String[] args) throws Exception {
		if (args == null || args.length != 2) {
			System.out.println("need inputpath and outputpath");
			return 1;
		}
		// HDFS input path
		Path inputpath = new Path(args[0]);
		// HDFS output path for the reduce results; it must not exist yet,
		// so remove any leftover from a previous run
		Path outputpath = new Path(args[1]);
		FileSystem fs = FileSystem.get(getConf());
		if (fs.exists(outputpath)) {
			fs.delete(outputpath, true);
		}

		Job job = Job.getInstance(getConf(), "MyJob");
		job.setJarByClass(MyJob.class);
		job.setOutputKeyClass(IntWritable.class);   // output key type, checked by the OutputFormat
		job.setOutputValueClass(IntWritable.class); // output value type, checked by the OutputFormat
		job.setMapperClass(myMapper.class);
		job.setCombinerClass(myReducer.class);
		job.setReducerClass(myReducer.class);
		FileInputFormat.setInputPaths(job, inputpath);   // input path in HDFS
		FileOutputFormat.setOutputPath(job, outputpath); // output path in HDFS

		Date startTime = new Date();
		logger.info("Job started: " + startTime);
		boolean success = job.waitForCompletion(true);
		Date endTime = new Date();
		logger.info("Job ended: " + endTime);
		logger.info("The job took "
				+ (endTime.getTime() - startTime.getTime()) / 1000 + " seconds.");
		return success ? 0 : 1;
	}
}

That completes a simple MapReduce job. Note that the Reducer doubles as the Combiner here, which is safe because taking a maximum is associative and commutative, so pre-combining partial maxima on the map side cannot change the final result. Package the classes into a JAR; the dependency JARs need not be bundled, since the hadoop jar command puts Hadoop's own libraries on the classpath. When packaging, set MyJob as the main class. Put the JAR in the Hadoop root directory, then load the files to analyze into HDFS:

Clear the output path: ./bin/hadoop fs -rm -r /output

Create the input path: ./bin/hadoop fs -mkdir /input

Upload the data: ./bin/hadoop fs -copyFromLocal <local path> <HDFS path>

Run the JAR: ./bin/hadoop jar myJob.jar /input /output

When the job completes, look in the output path for the results; with a single reducer the answer lands in part-r-00000.
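
Besides the shell, the result can also be read back programmatically through the HDFS API. A minimal sketch, assuming a single reducer wrote /output/part-r-00000 and that fs.defaultFS in the configuration points at the cluster:

package com.snwz.mapreduce;

import java.io.BufferedReader;
import java.io.InputStreamReader;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ReadOutput {
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		FileSystem fs = FileSystem.get(conf);
		// One reducer produces a single part file; this path is an assumption.
		Path result = new Path("/output/part-r-00000");
		try (BufferedReader reader = new BufferedReader(
				new InputStreamReader(fs.open(result)))) {
			String line;
			while ((line = reader.readLine()) != null) {
				System.out.println(line); // e.g. "1901	-21"
			}
		}
	}
}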
