diff --git a/src/pegparser.cpp b/src/pegparser.cpp index 29ff6e58..b92858ac 100644 --- a/src/pegparser.cpp +++ b/src/pegparser.cpp @@ -396,6 +396,131 @@ QVector PegParser::parseImageRegions(const QSharedPointer 00000yyy xxxxxxxx + unsigned char uch2 = *(p_ch + 1); + p_codePoint = ((uch & 0x1CL) << 6) + ((uch & 0x3L) << 6) + (uch2 & 0x3FL); + return 2; + } else if ((uch & 0xF0) == 0xE0) { + // 1110yyyy 10yyyyxx 10xxxxxx -> yyyyyyyy xxxxxxxx + unsigned char uch2 = *(p_ch + 1); + unsigned char uch3 = *(p_ch + 2); + p_codePoint = ((uch & 0xF) << 12) + + ((uch2 & 0x3CL) << 6) + ((uch2 & 0x3L) << 6) + + (uch3 & 0x3FL); + return 3; + } else if ((uch & 0xF8) == 0xF0) { + // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx -> 000zzzzz yyyyyyyy xxxxxxxx + unsigned char uch2 = *(p_ch + 1); + unsigned char uch3 = *(p_ch + 2); + unsigned char uch4 = *(p_ch + 3); + p_codePoint = ((uch & 0x7L) << 18) + + ((uch2 & 0x30L) << 12) + ((uch2 & 0xFL) << 12) + + ((uch3 & 0x3CL) << 6) + ((uch3 & 0x3L) << 6) + + (uch4 & 0x3FL); + return 4; + } else { + return -1; + } +} + +static inline void copyChars(char *p_dest, const char *p_src, int p_num) +{ + for (int i = 0; i < p_num; ++i) { + *(p_dest + i) = *(p_src + i); + } +} + +// @p_data: UTF-8 data array. +// If @p_data contain unicode characters with code value above 65535, it will break +// it into two characters with code value below 65536. +// Return null if there is no fix. Otherwise, return a fixed copy of the data. +static QSharedPointer tryFixUnicodeData(const char *p_data) +{ + bool needFix = false; + int sz = 0; + + const char *ch = p_data; + bool hasBOM = false; + if (HAS_UTF8_BOM(ch)) { + hasBOM = true; + ch += 3; + sz += 3; + } + + // Calculate the size of fixed data. + while (*ch != '\0') { + int cp; + int nr = utf8CodePoint(ch, cp); + if (nr == -1) { + return NULL; + } + + if (cp > MAX_CODE_POINT) { + needFix = true; + ch += nr; + // Use two one-byte chars to replace. + sz += 2; + } else { + ch += nr; + sz += nr; + } + } + + if (!needFix) { + return NULL; + } + + // Replace those chars with two one-byte chars. + QSharedPointer res(new char[sz + 1]); + char *newChar = res.data(); + int idx = 0; + ch = p_data; + if (hasBOM) { + copyChars(newChar + idx, ch, 3); + ch += 3; + idx += 3; + } + + while (*ch != '\0') { + int cp; + int nr = utf8CodePoint(ch, cp); + Q_ASSERT(nr > 0); + if (cp > MAX_CODE_POINT) { + *(newChar + idx) = X_CHAR; + *(newChar + idx + 1) = X_CHAR; + ch += nr; + idx += 2; + } else { + copyChars(newChar + idx, ch, nr); + ch += nr; + idx += nr; + } + } + + Q_ASSERT(idx == sz); + *(newChar + sz) = '\0'; + + return res; +} + pmh_element **PegParser::parseMarkdownToElements(const QSharedPointer &p_config) { if (p_config->m_data.isEmpty()) { @@ -403,7 +528,19 @@ pmh_element **PegParser::parseMarkdownToElements(const QSharedPointerm_data is encoding in UTF-8. + // QString stores a string of 16-bit QChars. Unicode characters with code values above 65535 are stored using surrogate pairs, i.e., two consecutive QChars. + // Hence, a QString using two QChars to save one code value if it's above 65535, with size() + // returning 2. pmh_markdown_to_elements() will treat it at the size of 1 (expectively). + // To make it work, we split unicode characters whose code value is above 65535 into two unicode + // characters whose code value is below 65535. char *data = p_config->m_data.data(); + QSharedPointer fixedData = tryFixUnicodeData(data); + if (fixedData) { + data = fixedData.data(); + } + pmh_markdown_to_elements(data, p_config->m_extensions, &pmhResult); return pmhResult; }