中文字符串在 C/C++ 中表示为字节序列,分词时需要根据其编码方式进行处理。一般的分词器需要先把输入转换成统一的编码再进行分词;有些分词器(如 ICTCLAS)在分词时可以不显式指定编码方式,而是先检测字符串的编码再进行转换。本文对项目中用到的几种编码转换方式进行总结,主要利用 iconv 完成编码转换。

/* A single zero code unit of the project's wide character type. */
const bchar_t zero[1] = {L'\0'};

/*
 * Shared implementation for the iconv based converters below.
 *
 * Converts `inlen` bytes starting at `inbuf` from encoding `from_code` to
 * encoding `to_code`, writing the result to `outbuf`.
 *
 * The caller must provide an output buffer of at least `inlen * 4` bytes;
 * that whole region is zero-filled before converting. If at least
 * `term_width` bytes remain unused after the conversion, a zero terminator
 * of that width is written right after the converted data.
 *
 * Conversion is best effort: the result of iconv() itself is not inspected,
 * so whatever was converted before an error stays in `outbuf`.
 *
 * Returns the number of bytes produced (terminator not included), or 0 when
 * `outbuf` is NULL or the conversion descriptor cannot be opened.
 */
static size_t iconv_convert(const char *to_code, const char *from_code,
                            char *inbuf, size_t inlen,
                            char *outbuf, size_t term_width)
{
    if (outbuf == NULL)
        return 0;

    const size_t capacity = inlen * 4;
    memset(outbuf, 0, capacity);

    iconv_t cd = iconv_open(to_code, from_code);
    if (cd == (iconv_t)-1)
        return 0;   /* unsupported conversion: leave the zeroed buffer as is */

    char *in = inbuf;
    char *out = outbuf;
    size_t in_left = inlen;
    size_t out_left = capacity;   /* iconv decrements this as it writes */
    (void)iconv(cd, &in, &in_left, &out, &out_left);

    /* memset instead of a cast store: `out` may not be aligned for wide units */
    if (out_left >= term_width)
        memset(out, 0, term_width);

    iconv_close(cd);
    return capacity - out_left;
}

/*
 * GBK -> UTF-16LE. `outbuf` must hold at least inlen * 4 bytes; the result
 * is terminated with one zero bchar_t when space allows.
 */
void gbk_utf16le(char* inbuf, size_t inlen, char* outbuf)
{
    iconv_convert("UTF-16LE//IGNORE", "GBK", inbuf, inlen, outbuf, sizeof(bchar_t));
}

/*
 * UTF-16LE -> GBK. `inlen` is in bytes. `outbuf` must hold at least
 * inlen * 4 bytes; the result is terminated with '\0' when space allows.
 */
void utf16le_gbk(char* inbuf, size_t inlen, char* outbuf)
{
    iconv_convert("GBK//IGNORE", "UTF-16LE", inbuf, inlen, outbuf, sizeof(char));
}

/*
 * UTF-16LE -> UTF-8. `inlen` is in bytes. `outbuf` must hold at least
 * inlen * 4 bytes; the result is terminated with '\0' when space allows.
 */
void utf16le_utf8(char* inbuf, size_t inlen, char* outbuf)
{
    iconv_convert("UTF-8//IGNORE", "UTF-16LE", inbuf, inlen, outbuf, sizeof(char));
}

/*
 * GBK -> UTF-16LE, additionally reporting the size of the result.
 * On return `outbuflen` holds the number of bytes written to `outbuf`
 * (terminator not included); it is 0 if the conversion could not be set up.
 */
void gbk_utf16le(char* inbuf, size_t inlen, char* outbuf, uint32_t& outbuflen)
{
    outbuflen = (uint32_t)iconv_convert("UTF-16LE//IGNORE", "GBK",
                                        inbuf, inlen, outbuf, sizeof(bchar_t));
}

/*
 * UTF-8 -> GBK. `outbuf` must hold at least inlen * 4 bytes; the result is
 * terminated with '\0' when space allows.
 */
void utf8_gbk(char* inbuf, size_t inlen, char* outbuf)
{
    iconv_convert("GBK//IGNORE", "UTF-8", inbuf, inlen, outbuf, sizeof(char));
}

/*
 * Packs consecutive byte pairs of `inbuf` into 16-bit values: for each pair
 * the first byte becomes the low 8 bits and the second byte the high 8 bits.
 *
 * `inlen` is the number of input bytes; a trailing unpaired byte is ignored.
 * `outlen` receives the number of values stored. A zero value is written
 * after the last one, so `outbuf` needs room for inlen / 2 + 1 elements.
 */
void ch_uint16(char* inbuf, int& inlen, uint16_t* outbuf, int& outlen)
{
    outlen = 0;
    for (int k = 0; k + 2 <= inlen; k += 2) {
        /* go through unsigned char so bytes >= 0x80 are not sign-extended */
        const unsigned lo = (unsigned char)inbuf[k];
        const unsigned hi = (unsigned char)inbuf[k + 1];
        outbuf[outlen] = (uint16_t)((hi << 8) | lo);
        outlen++;
    }
    outbuf[outlen] = 0;
}

最新文章

  1. 局部变量&&malloc函数&&生命周期的一些见解
  2. SDL1.2学习
  3. C语言 百炼成钢8
  4. mysql:学习学习~
  5. jQuery 源码分析3: jQuery.fn/ jQuery.prototype
  6. 【转】通知 Toast详细用法(显示view)
  7. 转:微博CacheService架构浅析
  8. 【计算几何初步-凸包-Jarvis步进法。】【HDU1392】Surround the Trees
  9. mongodb 导入数据库文件
  10. Sublime Text 3 全程详细图文原创教程
  11. 通过Javascript调用微软认知服务情感检测接口的两种实现方式
  12. duilib基本框架
  13. Hyperledger Fabric Read-Write set semantics——读写集
  14. Log4j1.2配置详解
  15. javaweb c3p0连接oracle12c
  16. springboot2 pagehelper 使用笔记
  17. linux常用命令简述
  18. python 对 sqlite3的简单使用
  19. Outlook Error: The Delegates settings were not saved correctly. Cannot activate send-on-behalf-of list.
  20. java正则表达式 需要转义的字符

热门文章

  1. DIV+CSS 网页布局之:一列布局
  2. pdo如何防止 sql注入
  3. POJ 2442 Sequence 优先队列
  4. angular分页指令
  5. iOS - instancetype
  6. c/c++多级指针
  7. [OI笔记] 最长上升子序列与网络流建模
  8. 【20161030la 】总结
  9. 编译Firebird的源码
  10. AsyncHttpClient 开源框架學習研究