- 论坛徽章:
- 0
|
/// Replaces decimal numeric character references of the form "&#[0-9]+;"
/// in @p str with the character they denote.
///
/// Each matched reference is converted with convChar() into the encoding
/// reported by getDefaultEncoding<std::string>(). Everything that is not a
/// complete, well-formed reference is copied to the result unchanged,
/// including whitespace, a lone '&', "&#" without digits, and digits that
/// are not followed by ';'.
///
/// @param str  Text that may contain numeric character references.
/// @return     A copy of @p str with every convertible reference replaced.
///             A reference whose conversion throws, or whose value is above
///             0x10FFFF, is kept as its original text.
std::string chineseCharDecode(const std::string& str){
    const std::string::size_type len = str.size();
    std::string decoded;
    decoded.reserve(len);

    std::string::size_type pos = 0;
    while (pos < len) {
        // Anything that does not start with "&#" is ordinary text. Scanning the
        // string by index (instead of formatted stream extraction) keeps spaces,
        // tabs and newlines intact.
        if (str[pos] != '&' || pos + 1 >= len || str[pos + 1] != '#') {
            decoded += str[pos];
            ++pos;
            continue;
        }

        // Collect the run of decimal digits that follows "&#".
        const std::string::size_type digitsBegin = pos + 2;
        std::string::size_type cur = digitsBegin;
        unsigned long code = 0;
        bool inRange = true;
        while (cur < len && str[cur] >= '0' && str[cur] <= '9') {
            if (inRange) {
                code = code * 10 + static_cast<unsigned long>(str[cur] - '0');
                // Stop accumulating once the value exceeds 0x10FFFF so that
                // very long digit runs cannot overflow the accumulator.
                if (code > 0x10FFFFUL) inRange = false;
            }
            ++cur;
        }

        if (cur == digitsBegin) {
            // "&#" is not followed by a number: keep it as plain text and
            // resume scanning right after it.
            decoded += "&#";
            pos = digitsBegin;
            continue;
        }

        const bool terminated = (cur < len && str[cur] == ';');
        const std::string::size_type end = terminated ? cur + 1 : cur;

        bool replaced = false;
        if (terminated && inRange) {
            // Complete reference: try to convert it.
            try {
                decoded += convChar(static_cast<uint32_t>(code),
                                    getDefaultEncoding<std::string>().c_str(),
                                    "UTF-16");
                replaced = true;
            }
            catch (...) {
                // Conversion failed; the original text is emitted below.
            }
        }
        if (!replaced) {
            // Missing ';', out-of-range value, or failed conversion:
            // reproduce the input exactly as it was written.
            decoded.append(str, pos, end - pos);
        }
        pos = end;
    }
    return decoded;
}
/// Converts a single Unicode code point into a string in encoding @p encTo.
///
/// The code point is written as UTF-16 code units in host byte order behind a
/// byte-order mark (0xFEFF) and passed to iconv, so @p encFrom has to name an
/// encoding that accepts that layout (chineseCharDecode passes "UTF-16").
/// Code points above 0xFFFF are encoded as a surrogate pair.
///
/// @param val      Code point to convert. 0 yields an empty string.
/// @param encTo    Target encoding name passed to iconv_open().
/// @param encFrom  Source encoding name passed to iconv_open().
/// @return         The converted bytes, or "" if iconv_open() fails (a message
///                 is printed to std::cout in that case).
/// Reports a failed conversion, or a @p val above 0x10FFFF, through THROW.
std::string convChar(uint32_t val,const char* encTo,const char* encFrom){
    iconv_t pt=iconv_open(encTo,encFrom);
    if(pt==(iconv_t)-1){
        std::cout<<"iconv_open 失败"<<std::endl;
        return "";
    }

    // Closes the descriptor on every exit path, including the THROW paths,
    // which previously left it open.
    struct DescriptorGuard{
        iconv_t handle;
        explicit DescriptorGuard(iconv_t h):handle(h){}
        ~DescriptorGuard(){ iconv_close(handle); }
    };
    DescriptorGuard guard(pt);

    // The result used to be cut at the first NUL byte, so code point 0
    // produced an empty string; keep that outcome.
    if(val==0){
        return std::string();
    }
    if(val>0x10FFFF){
        // Not a Unicode code point; there is no UTF-16 form to hand to iconv.
        THROW(" An invalid multibyte sequence has "
              "been encountered in the input.\n");
        return std::string();
    }

    // Put the descriptor into its initial conversion state.
    iconv(pt,NULL,NULL,NULL,NULL);

    // Input: BOM followed by one code unit, or a surrogate pair. A uint16_t
    // array avoids the misaligned 32-bit store into a byte buffer.
    uint16_t units[3];
    size_t unitCount=0;
    units[unitCount++]=0xfeff;
    if(val>=0x10000){
        const uint32_t offset=val-0x10000;
        units[unitCount++]=static_cast<uint16_t>(0xD800u|(offset>>10));
        units[unitCount++]=static_cast<uint16_t>(0xDC00u|(offset&0x3FFu));
    }
    else{
        units[unitCount++]=static_cast<uint16_t>(val);
    }

    // NOTE(review): POSIX/glibc declare the input pointer of iconv() as char**;
    // the const char** form of the original code is kept here - confirm it
    // matches the iconv header this file is built against.
    const char* src=reinterpret_cast<const char*>(units);
    const char** in=&src;
    size_t lenin=unitCount*sizeof(units[0]);

    char buf[100];
    char* out=buf;
    size_t b=sizeof(buf);

    // iconv() returns (size_t)-1 on failure; any other value, including a
    // non-zero count of irreversible conversions, is success.
    const size_t ret=iconv(pt,in,&lenin,&out,&b);
    if(ret==(size_t)-1){
        switch(errno){
        case E2BIG:
            THROW("There is not sufficient room at *outbuf.\n");
            break;
        case EILSEQ:
            THROW(" An invalid multibyte sequence has "
                  "been encountered in the input.\n");
            break;
        case EINVAL:
            THROW(" An incomplete multibyte sequence has been encountered in the input.\n");
            break;
        default:
            THROW("未知的errno\n");
        }
        // Only reached if THROW returns control.
        return std::string();
    }

    // iconv() advanced `out` past the bytes it produced; take exactly those
    // instead of relying on a terminating NUL in the buffer.
    return std::string(buf,static_cast<size_t>(out-buf));
}
|