c++ linux utf-8 编码 中文汉字分割(超简单代码)

UTF-8 编码对于英文字母,占用一个字节;
UTF-8 编码对于中文字符,占用多个字节,最初的设计最大占用6个字节(现行标准 RFC 3629 规定最多4个字节),其中第一个字节二进制的最高位连续1的个数表示占用字节的个数,例如:
汉字“中”占3个字节

“中” : 11100100  10111000  10101101  // 最高位连续3个1
“国” : 11100101  10011011  10111101  // 最高位连续3个1

实现汉字的分割代码:

// Split the UTF-8 encoded string `str` into one std::string per encoded
// character and collect the pieces in `list`.
vector<string> list;
int strSize = str.size();
int i = 0;

while (i < strSize) {
    // The number of consecutive 1 bits at the top of the lead byte is the
    // byte length of the sequence; at most 6 bits are inspected.
    int ones = 0;
    while (ones < 6 && (str[i] & (0x80 >> ones))) {
        ++ones;
    }
    // A byte whose top bit is clear (plain ASCII) still occupies one byte.
    const int len = (ones == 0) ? 1 : ones;
    list.push_back(str.substr(i, len));
    i += len;
}

C++ Unicode/UTF-8中文 相互转换

C++ Unicode/UTF-8中文 相互转换
需求一

中文 ”你好啊“ ———-unicode———-> \u4F60\u597D\u554A

unicode编码字符串转中文。
例: “4F60597D554A” —> “你好啊”
解决方式

需求二

unicode 字节数组转中文
例:0x89 0x7F 0x5B 0x89 0x8D 0xEF —> 西安路
解决方式

需求三 中英文转unicode

asd你好阿 —-> 0061007300644f60597d963f
解决方式

string 与 wstring 的转换


#include <codecvt>
#include <locale>
#include <string>
/// Convert a UTF-8 encoded narrow string into a wide string.
///
/// @param str  UTF-8 encoded bytes.
/// @return     The decoded wide string (one wchar_t per converted character).
/// @throws std::range_error if std::wstring_convert cannot convert the input
///         (no fallback error string is supplied to the converter).
/// @note std::wstring_convert / std::codecvt_utf8 are deprecated since C++17
///       but are still shipped by the standard library.
std::wstring s2ws(const std::string& str)
{
  using convert_typeX = std::codecvt_utf8<wchar_t>;

  std::wstring_convert<convert_typeX, wchar_t> converterX;

  return converterX.from_bytes(str);
}

/// Convert a wide string into a UTF-8 encoded narrow string.
///
/// @param wstr  Wide string to encode.
/// @return      The UTF-8 bytes produced by std::wstring_convert::to_bytes.
std::string ws2s(const std::wstring& wstr)
{
  // The element type of wstring_convert defaults to wchar_t, so only the
  // codecvt facet needs to be named.
  std::wstring_convert<std::codecvt_utf8<wchar_t>> utf8_codec;
  return utf8_codec.to_bytes(wstr);
}

C++Unicode 和 UTF-8 的转换

代码摘抄于 Stack Overflow

链接:https://stackoverflow.com/questions/12015571/how-to-print-unicode-character-in-c

我一开始的需求是需要在控制台打印进度,用连续的实心正方形来模拟进度,所以就想打印正方形(2588),所以就找到了这个帖子,顺便附上打印实心正方形的代码:

std::cout << "\u2588" << std::endl;

以下是一些转换代码:


#include <iostream>

using namespace std;


int utf8_to_unicode(string utf8_code);
string unicode_to_utf8(int unicode);


// Demo driver: prints the same four characters (U+0024, U+00A2, U+20AC,
// U+1F642) through both conversion directions, one row per input notation.
int main() {
    // Code points given in decimal -> UTF-8 text.
    cout << unicode_to_utf8(36) << '\t';
    cout << unicode_to_utf8(162) << '\t';
    cout << unicode_to_utf8(8364) << '\t';
    cout << unicode_to_utf8(128578) << endl;

    // The same code points given in hexadecimal -> UTF-8 text.
    cout << unicode_to_utf8(0x24) << '\t';
    cout << unicode_to_utf8(0xa2) << '\t';
    cout << unicode_to_utf8(0x20ac) << '\t';
    cout << unicode_to_utf8(0x1f642) << endl;

    // UTF-8 literals -> code points. The fourth literal had been mangled to
    // "?" by a lossy re-encoding; it is U+1F642, matching the other rows.
    cout << utf8_to_unicode("$") << '\t';
    cout << utf8_to_unicode("¢") << '\t';
    cout << utf8_to_unicode("€") << '\t';
    cout << utf8_to_unicode("🙂") << endl;

    // The same characters spelled as explicit UTF-8 bytes -> code points.
    cout << utf8_to_unicode("\x24") << '\t';
    cout << utf8_to_unicode("\xc2\xa2") << '\t';
    cout << utf8_to_unicode("\xe2\x82\xac") << '\t';
    cout << utf8_to_unicode("\xf0\x9f\x99\x82") << endl;

    return 0;
}

/**
 * Decode ONE UTF-8 encoded character into its code point.
 *
 * The whole string is treated as a single sequence: its length decides how
 * many payload bits are taken from the first byte (7 for a 1-byte string,
 * otherwise 7 - length), and every following byte contributes its low 6 bits.
 * The bytes are not validated (lead-byte pattern and 10xxxxxx continuation
 * markers are not checked).
 *
 * @param utf8_code  The bytes of exactly one encoded character (1..6 bytes).
 * @return The decoded code point; 0 for an empty string; -1 when the string
 *         is longer than 6 bytes and therefore cannot be a single sequence
 *         (previously this case shifted past the width of int, which is
 *         undefined behaviour).
 */
int utf8_to_unicode(std::string utf8_code) {
    const std::size_t utf8_size = utf8_code.length();

    if (utf8_size == 0)
        return 0;
    if (utf8_size > 6)
        return -1;

    // Payload bits carried by the lead byte: 0xxxxxxx -> 7, 110xxxxx -> 5,
    // 1110xxxx -> 4, 11110xxx -> 3, and so on.
    const int lead_bits = (utf8_size == 1) ? 7 : 7 - static_cast<int>(utf8_size);

    // Go through unsigned char so a signed char never sign-extends.
    unsigned int unicode =
        static_cast<unsigned char>(utf8_code[0]) & ((1u << lead_bits) - 1u);

    for (std::size_t p = 1; p < utf8_size; ++p)
        unicode = (unicode << 6) | (static_cast<unsigned char>(utf8_code[p]) & 0x3fu);

    // At most 1 + 5 * 6 = 31 bits are produced, so the value fits in int.
    return static_cast<int>(unicode);
}

/**
 * Encode a code point as UTF-8.
 *
 * @param unicode  Code point to encode.
 * @return 1 to 4 bytes for values in [0, 0x1fffff]; an empty string for
 *         negative values and for values above 0x1fffff (5- and 6-byte forms
 *         are deliberately not produced).
 *
 * Negative values used to fail the first range test but pass
 * `unicode <= 0x7ff`, so they were emitted as a meaningless 2-byte sequence;
 * they are now rejected up front.
 *
 * NOTE(review): the accepted upper bound is kept at 0x1fffff for backward
 * compatibility; surrogates and values above U+10FFFF are not filtered.
 */
std::string unicode_to_utf8(int unicode) {
    std::string s;

    if (unicode < 0)
        return s;  // not a code point

    if (unicode <= 0x7f) {            // 0xxxxxxx
        s += static_cast<char>(unicode);
    } else if (unicode <= 0x7ff) {    // 110xxxxx 10xxxxxx
        s += static_cast<char>(0xc0 | (unicode >> 6));
        s += static_cast<char>(0x80 | (unicode & 0x3f));
    } else if (unicode <= 0xffff) {   // 1110xxxx 10xxxxxx 10xxxxxx
        s += static_cast<char>(0xe0 | (unicode >> 12));
        s += static_cast<char>(0x80 | ((unicode >> 6) & 0x3f));
        s += static_cast<char>(0x80 | (unicode & 0x3f));
    } else if (unicode <= 0x1fffff) { // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        s += static_cast<char>(0xf0 | (unicode >> 18));
        s += static_cast<char>(0x80 | ((unicode >> 12) & 0x3f));
        s += static_cast<char>(0x80 | ((unicode >> 6) & 0x3f));
        s += static_cast<char>(0x80 | (unicode & 0x3f));
    }
    // Larger values would need 5 or 6 bytes; s stays empty for them.

    return s;
}

C++字符数字的编码(Encode)与解码(Decode)

在日常应用中,我们常用结构体或者类来存储一条信息,这种方式很方便,但是不利于数据的传输。例如在网络编程中,我们需要将结构中的数据转化为字节流才能进行传输,我们可以利用memcpy强行将结构化的数据转化为字符串,在接收方以同样的方式转化回来。此法简单易用,但是由于结构化的数据涉及到字节对齐的问题,这种方法会造成额外的数据开销,所以我们最好自己手动对结构化的数据进行编码。当然这种方法也有弊端:虽然在一定程度上节省了传输流量,但结构中的字段很多时,代码量会增大,最好编写工具自动生成一些代码。

#include <iostream>
 #include <memory.h>
 #include <string.h>
 using namespace std;
 
 /*
  * ENCODE(buf, size, offset, data)
  * Copies the raw bytes of `data` into `buf` at byte position `offset` and
  * advances `offset` by sizeof(data).
  * Makes the ENCLOSING function `return -1` when `buf` is NULL, `size` is 0,
  * or the value does not fit into the remaining `size - offset` bytes, so it
  * may only be used inside functions returning an integer.
  * `offset` and `data` must be lvalues. The body is wrapped in
  * do { } while (0) so the macro behaves as a single statement.
  */
 #define ENCODE(buf, size, offset, data)\
     do {\
         if ((NULL == (buf)) || (0 == (size)))\
         {\
             return -1;\
         }\
         if ((offset) + sizeof(data) > (size))\
         {\
             return -1;\
         }\
         memcpy(static_cast<unsigned char*>(static_cast<void*>(buf)) + (offset),\
                &(data), sizeof(data));\
         (offset) = (offset) + sizeof(data);\
     } while (0)
 
 /*
  * DECODE(buf, size, offset, data)
  * Copies sizeof(data) raw bytes from `buf` at byte position `offset` into
  * `data` and advances `offset` by sizeof(data).
  * Makes the ENCLOSING function `return -1` when `buf` is NULL, `size` is 0,
  * or fewer than sizeof(data) bytes remain after `offset`, so it may only be
  * used inside functions returning an integer.
  * `offset` and `data` must be lvalues. The body is wrapped in
  * do { } while (0) so the macro behaves as a single statement.
  */
 #define DECODE(buf, size, offset, data)\
     do {\
         if ((NULL == (buf)) || (0 == (size)))\
         {\
             return -1;\
         }\
         if ((offset) + sizeof(data) > (size))\
         {\
             return -1;\
         }\
         memcpy(&(data),\
                static_cast<const unsigned char*>(static_cast<const void*>(buf)) + (offset),\
                sizeof(data));\
         (offset) = (offset) + sizeof(data);\
     } while (0)
 
 /*
  * ENCODE_STR(buf, size, offset, data, length)
  * Copies `length` bytes starting at pointer `data` into `buf` at byte
  * position `offset` and advances `offset` by `length`.
  * Makes the ENCLOSING function `return -1` when `buf` is NULL, `size` is 0,
  * `length` is not positive, or `length` bytes do not fit after `offset`, so
  * it may only be used inside functions returning an integer.
  * `offset` must be an lvalue. The body is wrapped in do { } while (0) so the
  * macro behaves as a single statement.
  */
 #define ENCODE_STR(buf, size, offset, data, length)\
     do {\
         if ((NULL == (buf)) || (0 == (size)) || (0 >= (length)))\
         {\
             return -1;\
         }\
         if ((offset) + (length) > (size))\
         {\
             return -1;\
         }\
         memcpy(static_cast<unsigned char*>(static_cast<void*>(buf)) + (offset),\
                (data), (length));\
         (offset) = (offset) + (length);\
     } while (0)
 
 /*
  * DECODE_STR(buf, size, offset, data, length)
  * Copies `length` bytes from `buf` at byte position `offset` into the memory
  * pointed to by `data` and advances `offset` by `length`. No terminating NUL
  * is appended; the caller terminates the destination if it needs a C string.
  * Makes the ENCLOSING function `return -1` when `buf` is NULL, `size` is 0,
  * `length` is not positive, or fewer than `length` bytes remain after
  * `offset`, so it may only be used inside functions returning an integer.
  * `offset` must be an lvalue. The body is wrapped in do { } while (0) so the
  * macro behaves as a single statement.
  */
 #define DECODE_STR(buf, size, offset, data, length)\
     do {\
         if ((NULL == (buf)) || (0 == (size)) || (0 >= (length)))\
         {\
             return -1;\
         }\
         if ((offset) + (length) > (size))\
         {\
             return -1;\
         }\
         memcpy((data),\
                static_cast<const unsigned char*>(static_cast<const void*>(buf)) + (offset),\
                (length));\
         (offset) = (offset) + (length);\
     } while (0)
 
 // Buffer-size limits used by Msg.
 // NOTE(review): both numeric values were missing from the text (the literals
 // were stripped); 1024 and 32 are assumed values, chosen so that
 // sizeof(int) + enmMaxNameLength + sizeof(double) fits in enmMaxMsgLength.
 // Confirm against the original article.
 enum{
     enmMaxMsgLength = 1024,  // capacity in bytes of an encoded message buffer
     enmMaxNameLength = 32    // bytes reserved for the name, terminator included
 };
 
 /*
  * Msg: a small record (age, fixed-size name, score) that can serialize itself
  * into a flat byte buffer and read itself back.
  *
  * Wire layout produced by encode(), fields in host byte order and without
  * padding: int iAge | char szName[enmMaxNameLength] | double dScore.
  */
 class Msg{
     int iAge;
     char szName[enmMaxNameLength];
     double dScore;
 public:
     // Creates an empty record (age 0, empty name, score 0) so that display()
     // and encode() never read uninitialized members.
     Msg():iAge(0),dScore(0.0)
     {
         memset(szName,0,sizeof(szName));
     }
     // Creates a record. `name` is truncated to enmMaxNameLength - 1 characters
     // (the original unbounded strcpy could overflow szName); a NULL name gives
     // an empty name. The name buffer is zero-filled first so the encoded bytes
     // after the terminator are deterministic.
     Msg(int age,const char* name,double score):iAge(age),dScore(score)
     {
         memset(szName,0,sizeof(szName));
         if (NULL != name)
         {
             strncpy(szName,name,sizeof(szName)-1);
         }
     }
     virtual ~Msg(){}
     // Serializes the record into `buf`, which must provide at least
     // enmMaxMsgLength bytes (the whole buffer is cleared first).
     // Returns the number of bytes written, or -1 on failure.
     virtual int encode(char *buf)
     {
         size_t offset = 0;
         // Check before memset: the macros below test for NULL only after the
         // buffer would already have been written to.
         if (NULL == buf)
         {
             return -1;
         }
         // NOTE(review): the fill value was missing from the text; zero is assumed.
         memset(buf,0,enmMaxMsgLength);
         ENCODE(buf,enmMaxMsgLength,offset,iAge);
         ENCODE_STR(buf,enmMaxMsgLength,offset,szName,enmMaxNameLength);
         ENCODE(buf,enmMaxMsgLength,offset,dScore);
         return static_cast<int>(offset);
     }
     // Reads the record back from the first `bufSize` bytes of `buf`.
     // Returns the number of bytes consumed, or -1 when the buffer is NULL,
     // empty or too short; fields decoded before the failure keep their new
     // values.
     virtual int decode(char *buf,size_t bufSize)
     {
         size_t offset = 0;
         DECODE(buf,bufSize,offset,iAge);
         DECODE_STR(buf,bufSize,offset,szName,enmMaxNameLength);
         // The incoming bytes need not contain a terminator; force one so
         // display() cannot read past the end of szName.
         szName[enmMaxNameLength-1] = '\0';
         DECODE(buf,bufSize,offset,dScore);
         return static_cast<int>(offset);
     }
     // Prints "<age> <name> <score>" followed by a newline to standard output.
     void display() const
     {
         cout<<iAge<<" "<<szName<<" "<<dScore<<endl;
     }
 };
 
 int main(int argc, char* argv[])
 {
     size_t offset = ;
     char buf[enmMaxMsgLength],*recv = NULL;
     Msg msg(,"hwllo world",23.69),msg1;
     msg.display();
     offset = msg.encode(buf);
     cout<<offset<<endl;
     cout<<sizeof(Msg)<<endl;
     recv = new char[offset];
     memcpy(recv,buf,offset);
     msg1.decode(recv,offset);
     msg1.display();
     return ;
 }