在使用 parquet-hadoop.jar 解析 Parquet 文件时,decimal 类型的数据读出来是乱码,具体解决方法如下:

使用 parquet-hadoop.jar 解析 HttpFS 服务提供的 Parquet 文件,代码如下:

/**
 * Reads a Parquet file through HttpFS ({@code webhdfs://}) and prints every record as one
 * tab-separated line, preceded by a header of {@code columnName(PHYSICAL_TYPE)} entries.
 *
 * <p>Every field is accessed via {@code asPrimitiveType()}, so this assumes a flat schema made
 * of primitive columns only.
 *
 * @throws Exception if the file cannot be opened or read
 */
@Test
public void httpfsReadHiveParquetFile() throws Exception {
    Path path = new Path("webhdfs://s128:14000/wbd_test/parq1.0.parq");
    Configuration conf = new Configuration();
    conf.set("fs.webhdfs.impl", WebHdfsFileSystem.class.getName());

    // NOTE(review): sample token hard-coded for the demo; load real credentials from configuration.
    Map<String, String> urlParams = new HashMap<>();
    urlParams.put("user.token", "7hmsNJIget0eGO5maKQ=sfds");
    conf.set(WebHdfsFileSystem.HTTPFS_URL_PARAM, JSON.toJSONString(urlParams));

    FileSystem fs = path.getFileSystem(conf);
    FileStatus fileStatus = fs.getFileStatus(path);
    InputFile inputFile = HadoopInputFile.fromStatus(fileStatus, conf);

    // NOTE(review): no ReadSupport is handed to the builder (the original created a
    // GroupReadSupport but never used it); confirm the builder in your parquet version
    // supplies one for Group records.
    ParquetReader.Builder<Group> builder = ParquetReader.read(inputFile);
    builder.withConf(conf);

    // try-with-resources so the reader (and its underlying stream) is always closed.
    try (ParquetReader<Group> reader = builder.build()) {
        Group line = reader.read();
        if (line != null) {
            // The schema is taken from the first record and reused for all following rows.
            List<Type> typeList = line.getType().getFields();
            for (Type type : typeList) {
                System.out.print(type.getName() + "(" + type.asPrimitiveType().getPrimitiveTypeName().name() + ")\t\t");
            }
            System.out.println();
            System.out.println("-----------------------------------------------------------------------------------------------------------");
            for (; line != null; line = reader.read()) {
                for (Type type : typeList) {
                    System.out.print(converterType2Java(line, type) + "\t\t");
                }
                System.out.println();
            }
        }
    }
    System.out.println("It is over !");
}

/**
 * Converts the first value of one column of a record into its string representation.
 *
 * <p>INT96 values are rendered as epoch milliseconds (see {@link #getTimestampMillis}) and
 * FIXED_LEN_BYTE_ARRAY values annotated as DECIMAL are rendered as a plain decimal string with
 * exactly {@code scale} fraction digits.
 *
 * @param line the record to read from
 * @param type the column to convert; must be a primitive type
 * @return the value as text, or {@code null} when the column has no value in this record or is
 *         a FIXED_LEN_BYTE_ARRAY that is not annotated as DECIMAL
 */
public static String converterType2Java(Group line, Type type) {
    String fieldName = type.getName();
    if (line.getFieldRepetitionCount(fieldName) == 0) {
        // Column is absent (null) in this record.
        return null;
    }

    String fieldType = type.asPrimitiveType().getPrimitiveTypeName().name();
    switch (fieldType) {
        case "BOOLEAN":
            return String.valueOf(line.getBoolean(fieldName, 0));
        case "INT32":
            return String.valueOf(line.getInteger(fieldName, 0));
        case "INT64":
            return String.valueOf(line.getLong(fieldName, 0));
        case "INT96":
            return String.valueOf(getTimestampMillis(line.getInt96(fieldName, 0)));
        case "FLOAT":
            return String.valueOf(line.getFloat(fieldName, 0));
        case "DOUBLE":
            return String.valueOf(line.getDouble(fieldName, 0));
        case "FIXED_LEN_BYTE_ARRAY": {
            if (type.getOriginalType() == null || !"DECIMAL".equals(type.getOriginalType().name())) {
                // Only DECIMAL-annotated fixed-length binaries are decoded.
                return null;
            }
            int precision = type.asPrimitiveType().getDecimalMetadata().getPrecision();
            int scale = type.asPrimitiveType().getDecimalMetadata().getScale();
            BigDecimal decimalValue = binaryToDecimal(precision, scale, line.getBinary(fieldName, 0).getBytes());
            // The BigDecimal already carries the column scale, so toPlainString() yields exactly
            // `scale` fraction digits, independent of locale and without a dangling separator.
            return decimalValue.toPlainString();
        }
        case "BINARY":
        default:
            return line.getString(fieldName, 0);
    }
}

/**
 * Converts a 12-byte INT96 timestamp into milliseconds.
 *
 * <p>Layout as read here: bytes 0-7 hold the nanoseconds within the day and bytes 8-11 hold the
 * Julian day number, both little-endian.
 *
 * <p>NOTE(review): {@code NANOS_PER_MILLISECOND}, {@code JULIAN_EPOCH_OFFSET_DAYS} and
 * {@code MILLIS_IN_DAY} are not shown in this snippet; they are presumably 1_000_000,
 * 2_440_588 (Julian day of 1970-01-01) and 86_400_000 - confirm before use.
 *
 * @param timestampBinary the raw INT96 value
 * @return the timestamp in milliseconds, or {@code 0} when the value is not exactly 12 bytes long
 */
public static long getTimestampMillis(Binary timestampBinary) {
    if (timestampBinary.length() != 12) {
        return 0;
    }
    byte[] bytes = timestampBinary.getBytes();

    // little endian encoding - need to invert byte order
    long timeOfDayNanos = Longs.fromBytes(bytes[7], bytes[6], bytes[5], bytes[4], bytes[3], bytes[2], bytes[1], bytes[0]);
    int julianDay = Ints.fromBytes(bytes[11], bytes[10], bytes[9], bytes[8]);

    return julianDayToMillis(julianDay) + (timeOfDayNanos / NANOS_PER_MILLISECOND);
}

/**
 * Converts a Julian day number into milliseconds relative to the day {@code JULIAN_EPOCH_OFFSET_DAYS}.
 *
 * @param julianDay the Julian day number
 * @return milliseconds between the offset day and {@code julianDay}
 */
private static long julianDayToMillis(int julianDay) {
    // Widen before multiplying so the product cannot overflow if both constants are ints.
    return (long) (julianDay - JULIAN_EPOCH_OFFSET_DAYS) * MILLIS_IN_DAY;
}

/**
 * Decodes a Parquet DECIMAL stored as a big-endian two's-complement byte array.
 *
 * <p>The bytes are interpreted as the unscaled integer and combined with {@code scale} exactly;
 * no floating-point arithmetic is involved, so no digits are lost regardless of precision or
 * array length.
 *
 * @param precision declared precision of the column; kept for interface compatibility, the
 *                  decoding itself does not depend on it
 * @param scale     number of digits to the right of the decimal point
 * @param bytes     big-endian two's-complement unscaled value
 * @return the decoded value with the given scale; zero (at that scale) for an empty array
 */
static BigDecimal binaryToDecimal(int precision, int scale, byte[] bytes) {
    if (bytes.length == 0) {
        // BigInteger rejects zero-length input; treat "no bytes" as zero.
        return BigDecimal.valueOf(0L, scale);
    }
    return new BigDecimal(new BigInteger(bytes), scale);
}

Parquet 文件中的 timestamp 类型实际以 INT96 存储,decimal 类型实际以 FIXED_LEN_BYTE_ARRAY(定长二进制)存储,要想得到原来的数据,都需要进行转换。网上很少能找到相关资料,希望本文对其他人有所帮助。

最新文章

  1. rabbitMQ学习(三)
  2. redis3.0.0 集群安装详细步骤
  3. Python全栈之路---运算符与基本的数据结构
  4. 安装vs2013以后,链接数据库总是报内存损坏,无法写入的错误
  5. [转载] LinkedIn架构这十年
  6. Make the “Check out” function available in the office document opened with Document ID link
  7. C++类型转化
  8. php 操作数组 (合并,拆分,追加,查找,删除等)
  9. JavaBean以及MVC模式
  10. 开发自定义View
  11. USACO Section 2.1 Sorting a Three-Valued Sequence
  12. Blackboard - 百度百科
  13. Scrollview中嵌套ListView(自定义组件解决)
  14. copy&deepcopy
  15. 让网站通过Https访问
  16. 【问题解决】jhipster-registry-master空白页
  17. 一切皆Socket
  18. 潭州课堂25班:Ph201805201 WEB 之 JS 第六课 (课堂笔记)
  19. bzoj 2243: [SDOI2011]染色 线段树区间合并+树链剖分
  20. MUI - IOS系统,相册选择照片后,点击确定按钮无反应

热门文章

  1. vue 打包部署到服务器上 配置nginx访问
  2. 转:VMware 15 安装 MAC OS 10.13 原版(详细图文教程)
  3. .NET DLL 保护措施详解(非混淆加密加壳)
  4. StringUtils字符串工具类左侧补齐(leftPad)、右侧补齐(rightPad)、左右两侧补齐(center)工具方法
  5. C++学习 之 初识命名空间
  6. 自定义View绘制简单的圆环的实现
  7. androidstudio导入新项目build tools不符合问题解决
  8. Mysqlfunc.c
  9. 获取packageName和startActivity
  10. Implementing a Dynamic Vector (Array) in C(使用c实现动态数组Vector)