PegParser: fix Unicode characters above 65535

QString uses two QChars (a surrogate pair) to store one Unicode character
whose code point is above 65535, which causes the parse results to shift.
This commit is contained in:
Le Tan 2018-10-15 19:39:28 +08:00
parent 29079ade72
commit 8dbcf139d8

View File

@ -396,6 +396,131 @@ QVector<VElementRegion> PegParser::parseImageRegions(const QSharedPointer<PegPar
return regs; return regs;
} }
// Largest code point that is treated as a single character; anything above
// this value is replaced by tryFixUnicodeData().
#define MAX_CODE_POINT 65535

// Placeholder byte written in place of a character above MAX_CODE_POINT.
// NOTE: 86 is ASCII 'V' (despite the macro name).
#define X_CHAR 86U

// Evaluate to true if the three bytes starting at @x are the UTF-8 byte order
// mark (EF BB BF). The bytes are tested in order and the test short-circuits,
// so a string shorter than three bytes is never read past its terminator.
// @x is parenthesized so that pointer expressions such as `p + 1` work.
#define HAS_UTF8_BOM(x) ( ((*(x) & 0xFF) == 0xEF)\
                          && ((*((x) + 1) & 0xFF) == 0xBB)\
                          && ((*((x) + 2) & 0xFF) == 0xBF) )
// Decode the UTF-8 sequence starting at @p_ch into @p_codePoint.
// Return the number of bytes consumed (1-4), or -1 if @p_ch does not start a
// well-formed sequence: either the lead byte is invalid, or a continuation
// byte is missing or not of the form 10xxxxxx.
// @p_codePoint is left untouched when -1 is returned.
// Continuation bytes are validated one by one before the next one is read, so
// a sequence truncated by the terminating '\0' is rejected without reading
// past the terminator.
static inline int utf8CodePoint(const char *p_ch, int &p_codePoint)
{
    const unsigned char *bytes = reinterpret_cast<const unsigned char *>(p_ch);
    const unsigned char lead = bytes[0];

    int len = 0;
    int cp = 0;
    if ((lead & 0x80) == 0) {
        // 0xxxxxxx: plain ASCII.
        p_codePoint = lead;
        return 1;
    } else if ((lead & 0xE0) == 0xC0) {
        // 110xxxxx 10xxxxxx
        len = 2;
        cp = lead & 0x1F;
    } else if ((lead & 0xF0) == 0xE0) {
        // 1110xxxx 10xxxxxx 10xxxxxx
        len = 3;
        cp = lead & 0x0F;
    } else if ((lead & 0xF8) == 0xF0) {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        len = 4;
        cp = lead & 0x07;
    } else {
        // Stray continuation byte or a lead byte that UTF-8 does not allow.
        return -1;
    }

    // Each continuation byte contributes its low six bits.
    for (int i = 1; i < len; ++i) {
        const unsigned char cont = bytes[i];
        if ((cont & 0xC0) != 0x80) {
            return -1;
        }

        cp = (cp << 6) | (cont & 0x3F);
    }

    p_codePoint = cp;
    return len;
}
// Copy @p_num bytes from @p_src to @p_dest, front to back.
// Nothing is copied when @p_num is zero or negative.
static inline void copyChars(char *p_dest, const char *p_src, int p_num)
{
    const char *from = p_src;
    char *to = p_dest;
    for (int remaining = p_num; remaining > 0; --remaining) {
        *to++ = *from++;
    }
}
// Deleter for the buffer returned by tryFixUnicodeData(). The buffer is
// allocated with new[], so it must be released with delete[]; the default
// QSharedPointer deleter would use plain delete.
static void freeFixedUnicodeData(char *p_buf)
{
    delete[] p_buf;
}

// @p_data: null-terminated UTF-8 data array.
// If @p_data contains Unicode characters with code point above MAX_CODE_POINT,
// each such character is replaced by two X_CHAR bytes, so that it occupies two
// single-byte characters in the result. A leading UTF-8 BOM is kept as is.
// Return a null pointer if there is nothing to fix, or if @p_data is null or
// is not well-formed UTF-8. Otherwise, return a fixed, null-terminated copy
// of the data.
static QSharedPointer<char> tryFixUnicodeData(const char *p_data)
{
    if (!p_data) {
        return QSharedPointer<char>();
    }

    bool needFix = false;
    int sz = 0;
    const char *ch = p_data;
    bool hasBOM = false;
    if (HAS_UTF8_BOM(ch)) {
        hasBOM = true;
        ch += 3;
        sz += 3;
    }

    // First pass: validate the data and calculate the size of the fixed data.
    while (*ch != '\0') {
        int cp = 0;
        const int nr = utf8CodePoint(ch, cp);
        if (nr == -1) {
            return QSharedPointer<char>();
        }

        // A sequence that spans the terminator is truncated. Give up instead
        // of stepping over the '\0' and scanning memory beyond the string.
        for (int i = 1; i < nr; ++i) {
            if (ch[i] == '\0') {
                return QSharedPointer<char>();
            }
        }

        if (cp > MAX_CODE_POINT) {
            needFix = true;
            // Two one-byte chars will replace this character.
            sz += 2;
        } else {
            sz += nr;
        }

        ch += nr;
    }

    if (!needFix) {
        return QSharedPointer<char>();
    }

    // Second pass: copy the data, replacing those chars with two one-byte chars.
    QSharedPointer<char> res(new char[sz + 1], freeFixedUnicodeData);
    char *newChar = res.data();
    int idx = 0;
    ch = p_data;
    if (hasBOM) {
        copyChars(newChar + idx, ch, 3);
        ch += 3;
        idx += 3;
    }

    while (*ch != '\0') {
        int cp = 0;
        const int nr = utf8CodePoint(ch, cp);
        // The first pass has already accepted every sequence.
        Q_ASSERT(nr > 0);
        if (cp > MAX_CODE_POINT) {
            *(newChar + idx) = X_CHAR;
            *(newChar + idx + 1) = X_CHAR;
            idx += 2;
        } else {
            copyChars(newChar + idx, ch, nr);
            idx += nr;
        }

        ch += nr;
    }

    Q_ASSERT(idx == sz);
    *(newChar + sz) = '\0';
    return res;
}
pmh_element **PegParser::parseMarkdownToElements(const QSharedPointer<PegParseConfig> &p_config) pmh_element **PegParser::parseMarkdownToElements(const QSharedPointer<PegParseConfig> &p_config)
{ {
if (p_config->m_data.isEmpty()) { if (p_config->m_data.isEmpty()) {
@ -403,7 +528,19 @@ pmh_element **PegParser::parseMarkdownToElements(const QSharedPointer<PegParseCo
} }
pmh_element **pmhResult = NULL; pmh_element **pmhResult = NULL;
// p_config->m_data is encoded in UTF-8.
// QString stores a string of 16-bit QChars. Unicode characters with code values above 65535 are stored using surrogate pairs, i.e., two consecutive QChars.
// Hence, a QString uses two QChars to store one code point if it is above 65535, with size()
// returning 2, whereas pmh_markdown_to_elements() will count it as a single character (size 1).
// To make them agree, we replace each Unicode character whose code point is above 65535 with two
// characters whose code points are below 65536.
char *data = p_config->m_data.data(); char *data = p_config->m_data.data();
QSharedPointer<char> fixedData = tryFixUnicodeData(data);
if (fixedData) {
data = fixedData.data();
}
pmh_markdown_to_elements(data, p_config->m_extensions, &pmhResult); pmh_markdown_to_elements(data, p_config->m_extensions, &pmhResult);
return pmhResult; return pmhResult;
} }