1.txt
dong xi cheng
xi dong cheng
wo ai beijing
tian an men
qiche
dong
dong
dong
2.txt
dong xi cheng
xi dong cheng
wo ai beijing
tian an men
qiche
dong
dong
dong

import java.io.IOException;
import java.util.Random; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.map.InverseMapper;
import org.apache.hadoop.mapreduce.lib.map.RegexMapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
import org.apache.hadoop.mapreduce.lib.reduce.LongSumReducer; public class IGrep { public static void main(String[] args) throws IOException,
ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration(); String dir_in = "hdfs://localhost:9000/input_grep";
String dir_out = "hdfs://localhost:9000/output_grep";
String reg = ".ng";//匹配三个字符的字符串,且以ng结尾。 conf.set(RegexMapper.PATTERN, reg);
conf.setInt(RegexMapper.GROUP, 0); Path in = new Path(dir_in);
Path tmp = new Path("grep-temp-"
+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
Path out = new Path(dir_out); try {
Job grepJob = new Job(conf, "grep-search"); grepJob.setJarByClass(IGrep.class); grepJob.setInputFormatClass(TextInputFormat.class);
grepJob.setMapperClass(RegexMapper.class);
grepJob.setCombinerClass(LongSumReducer.class);
grepJob.setPartitionerClass(HashPartitioner.class); grepJob.setMapOutputKeyClass(Text.class);
grepJob.setMapOutputValueClass(LongWritable.class);
FileInputFormat.addInputPath(grepJob, in); grepJob.setReducerClass(LongSumReducer.class);
// job.setNumReduceTasks(1);
grepJob.setOutputFormatClass(SequenceFileOutputFormat.class); grepJob.setOutputKeyClass(Text.class);
grepJob.setOutputValueClass(LongWritable.class);
FileOutputFormat.setOutputPath(grepJob, tmp); grepJob.waitForCompletion(true); Job sortJob = new Job(conf, "grep-sort"); sortJob.setJarByClass(IGrep.class); sortJob.setInputFormatClass(SequenceFileInputFormat.class);
sortJob.setMapperClass(InverseMapper.class);
FileInputFormat.addInputPath(sortJob, tmp); sortJob.setNumReduceTasks(1);【全局排序】
sortJob.setSortComparatorClass(LongWritable.DecreasingComparator.class);//逆序 FileOutputFormat.setOutputPath(sortJob, out); sortJob.waitForCompletion(true); } finally {
FileSystem.get(conf).delete(tmp, true);
}
}
} 输出结果:
10    ong
4    eng
2    ing

最新文章

  1. 同时闪烁多个要素代码(ArcEngine)
  2. voxel 与 pixel
  3. solrcloud 配置实践
  4. gO语言的安装和环境变量的配置
  5. VM设置BIOS延长时间
  6. NOSQL Benchmarks
  7. iOS学习之UITableView编辑
  8. poj 1330 Nearest Common Ancestors 裸的LCA
  9. Codevs No.2144 砝码称重2
  10. Poj 2159 / OpenJudge 2159 Ancient Cipher
  11. WCF-NAT模式访问
  12. linux学习初体验
  13. 使用STS创建springboot项目pom.xml文件报错org.apache.maven.archiver.MavenArchiver.getManifest
  14. Codeforces Round #423 (Div. 2, rated, based on VK Cup Finals) D. High Load 构造
  15. 使用Flask部署机器学习模型
  16. Softmax 损失-梯度计算
  17. 一段JS控制TD中图片的大小的代码
  18. C标准库函数中复杂的函数声明
  19. C++设计模式 之 “对象性能” 模式:Singleton、Flyweight
  20. 我眼中的PD(产品狗)

热门文章

  1. 转:实现Java Web程序的自动登录
  2. JSON 数据的系统解析
  3. [置顶] 提高生产力:开源Java工具包Jodd(Java的”瑞士军刀”)
  4. 将html中的br换行符转换为文本输入中的换行符(转)
  5. Hadoop流程---从tpch到hive
  6. 关于struts2中action请求会执行两次的问题
  7. linux下面安装和配置nginx
  8. Navicat Premium 自动备份mysql和sqlserver
  9. 在DataTable中更新、删除数据
  10. Android中dip,dp,sp,pt和px的区别