hadoop-job(MapReduce 计算单词出现的个数)
2024-09-05 09:32:33
1.============map===============
package com.it18zhang.hadoop.mr; import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper; import java.io.IOException; /**
* Mapper
*/
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
/**
* key : 行首偏移量,字节数,意义不大。
* value : 一行文本
*/
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
//
String line = value.toString() ;
String[] arr = line.split(" "); Text keyOut = new Text() ;
IntWritable valueOut = new IntWritable(1) ;
for(String word : arr){
keyOut.set(word);
context.write(keyOut,valueOut);
}
}
}
2.============reducer===============
package com.it18zhang.hadoop.mr; import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer; import java.io.IOException; /**
* reducer
*/
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
/**
* key : word
* values : 该key下聚合的value
*/
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int count = 0 ;
for(IntWritable iw : values){
count = count + iw.get() ;
}
context.write(key , new IntWritable(count));
}
}
3.============统计===============
package com.it18zhang.hadoop.mr; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import java.io.IOException;
/**
 * Word-count driver: wires {@link WordCountMapper} and
 * {@link WordCountReducer} into a MapReduce job and submits it.
 *
 * <p>Usage: {@code App <inputPath> <outputPath>}
 */
public class App {
    public static void main(String[] args) throws Exception {
        if (args == null || args.length < 2) {
            // Programmer/usage error -> standard unchecked type, not raw Exception.
            throw new IllegalArgumentException("参数不足,需要2个参数");
        }
        Configuration conf = new Configuration();

        // MapReduce refuses to start if the output directory already exists,
        // so remove it recursively up front.
        FileSystem fs = FileSystem.get(conf);
        Path outputPath = new Path(args[1]);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }

        Job job = Job.getInstance(conf);
        job.setJobName("word_count_add");
        // Lets Hadoop locate the jar containing this class on the cluster.
        job.setJarByClass(App.class);

        // Input to read and output directory to write.
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, outputPath);

        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);

        // Output key/value types (shared by map and reduce phases here).
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        job.setNumReduceTasks(1);

        // Propagate job success/failure to the caller instead of discarding
        // the boolean returned by waitForCompletion (the original always
        // exited 0 even when the job failed).
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
最新文章
- ArtDialog文档
- CGContextAddCurveToPoint 的深入理解
- mysql常见的运算符及使用
- 老项目的#iPhone6于iPhone6Plus适配#iPhone6分辨率与适配
- 了解CentOS及周边
- bzoj 3196 Tyvj 1730 二逼平衡树(线段树套名次树)
- RHCA442学习笔记-Unit11内存回收
- Windows 10 IoT Serials 4 - 如何在树莓派上使用Cortana语音助手
- Javascript中的数组去重-indexof方法
- 如何去掉修改Joomla、joomlart及其模版版权、标志、图标的方法
- Swift基础之CoreData的使用
- postman的安装与使用(模拟请求)
- Kubernetes 1.10.0离线安装
- Phpstorm数组对齐设置
- netty 在线教程
- echarts中dataZoom的使用
- 【接口时序】2、Verilog实现流水灯及与C语言的对比
- (String中)正则表达式使用如下
- POJ3159 Candies
- 【转】Spark源码分析之-deploy模块