job on hadoop
//http://distributed-agility.blogspot.com/2010/01/hadoop-0201-example-inverted-line-index.html
//https://portal.futuregrid.org/manual/hadoop-wordcount
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
*LineIndexerCreatesaninvertedindexoverallthewordsinadocumentcorpus,mappingeachobservedwordtoalist
*offilename@offsetlocationswhereitoccurs.
*/
publicclassLineIndexerextendsConfiguredimplementsTool{
//wheretoputthedatainhdfswhenwe'redone
privatestaticfinalStringOUTPUT_PATH="output";
//wheretoreadthedatafrom.
privatestaticfinalStringINPUT_PATH="input";
publicstaticvoidmain(String[]args)throwsException{
intres=ToolRunner.run(newConfiguration(),newLineIndexer(),args);
System.exit(res);
}
publicintrun(String[]args)throwsException{
Configurationconf=getConf();
Jobjob=newJob(conf,"LineIndexer1");
job.setJarByClass(LineIndexer.class);
job.setMapperClass(LineIndexMapper.class);
job.setReducerClass(LineIndexReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job,newPath(INPUT_PATH));
FileOutputFormat.setOutputPath(job,newPath(OUTPUT_PATH));
returnjob.waitForCompletion(true)?0:1;
}
}
After updating, make sure to generate a new jar, remove anything under the directory "output" (since the program does not clean that up), and execute the new version.
training@training-vm:~/git/exercises/shakespeare$ ant jar
Buildfile: build.xml
compile:
    [javac] Compiling 4 source files to /home/training/git/exercises/shakespeare/bin
jar:
      [jar] Building jar: /home/training/git/exercises/shakespeare/indexer.jar
BUILD SUCCESSFUL
Total time: 1 second
I have added 2 ASCII books to the input directory: the works of Leonardo da Vinci and the first volume of the book "The Outline of Science".
training@training-vm:~/git/exercises/shakespeare$ hadoop fs -ls input
Found 3 items
-rw-r--r--   1 training supergroup    5342761 2009-12-30 11:57 /user/training/input/all-shakespeare
-rw-r--r--   1 training supergroup    1427769 2010-01-04 17:42 /user/training/input/leornardo-davinci-all.txt
-rw-r--r--   1 training supergroup     674762 2010-01-04 17:42 /user/training/input/the-outline-of-science-vol1.txt
The execution and output of running this example are shown below.
training@training-vm:~/git/exercises/shakespeare$ hadoop jar indexer.jar index.LineIndexer
10/01/04 21:11:55 INFO input.FileInputFormat: Total input paths to process : 3
10/01/04 21:11:56 INFO mapred.JobClient: Running job: job_200912301017_0017
10/01/04 21:11:57 INFO mapred.JobClient:  map 0% reduce 0%