utf8.hpp 15.2 KB
Newer Older
Monika Agarwal committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380
/*************************************************************************
 *
 * Copyright 2016 Realm Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 **************************************************************************/

#ifndef REALM_UTIL_UTF8_HPP
#define REALM_UTIL_UTF8_HPP

#include <cstdint>
#include <string>

#include <realm/util/safe_int_ops.hpp>
#include <realm/string_data.hpp>
#include <realm/util/features.h>
#include <realm/utilities.hpp>

namespace realm {
namespace util {


/// Transcode between UTF-8 and UTF-16.
///
/// This is a stateless collection of static functions; all state (the read
/// and write positions) is passed in and out through reference parameters.
///
/// \tparam Char16 Must be an integral type with at least 16 bits.
///
/// \tparam Traits16 Must define to_int_type() and to_char_type() for
/// \a Char16.
template <class Char16, class Traits16 = std::char_traits<Char16>>
struct Utf8x16 {
    /// Transcode as much as possible of the specified UTF-8 input, to
    /// UTF-16. Returns true if all input characters were transcoded, or
    /// transcoding stopped because the next character did not fit into the
    /// output buffer. Returns false if transcoding stopped due to invalid
    /// input. It is not specified whether this function returns true or false
    /// if invalid input occurs at the same time as the output buffer runs
    /// full. In any case, upon return, \a in_begin and \a out_begin are
    /// advanced to the position where transcoding stopped.
    ///
    /// Input is considered invalid when a sequence starts with a byte in the
    /// range 0x80-0xBF or a byte >= 0xF8, when a multi-byte sequence is cut
    /// short by \a in_end, when a continuation byte is not of the form
    /// 10xxxxxx, when a character is encoded with more bytes than necessary
    /// (overlong encoding), when a three-byte sequence decodes to a value in
    /// the surrogate range 0xD800-0xDFFF, or when a four-byte sequence decodes
    /// to a value of 0x110000 or above.
    ///
    /// A four-byte UTF-8 sequence produces a surrogate pair, and is therefore
    /// only transcoded when at least two output elements are available.
    ///
    /// \param in_begin On entry, the first byte of the UTF-8 input. On return,
    /// the first byte of the first sequence that was not transcoded.
    ///
    /// \param in_end One past the last byte of the UTF-8 input.
    ///
    /// \param out_begin On entry, the start of the output buffer. On return,
    /// one past the last UTF-16 element that was written.
    ///
    /// \param out_end One past the end of the output buffer.
    ///
    /// Throws only if Traits16::to_char_type() throws.
    static bool to_utf16(const char*& in_begin, const char* in_end, Char16*& out_begin, Char16* out_end);

    /// Same as to_utf16(), but in reverse: transcode as much as possible of
    /// the specified UTF-16 input to UTF-8, with the same meaning of the
    /// return value and the same updating of \a in_begin and \a out_begin.
    ///
    /// Each UTF-16 element outside the surrogate range produces one, two, or
    /// three bytes of output, and a surrogate pair produces four bytes. A
    /// character is only transcoded when all of its bytes fit into the output
    /// buffer.
    ///
    /// Input is considered invalid when an element in the range 0xDC00-0xDFFF
    /// occurs where the first half of a surrogate pair is expected, when the
    /// first half of a surrogate pair is the last element of the input, or
    /// when the first half is not followed by an element in the range
    /// 0xDC00-0xDFFF.
    ///
    /// Throws only if Traits16::to_int_type() throws.
    static bool to_utf8(const Char16*& in_begin, const Char16* in_end, char*& out_begin, char* out_end);

    /// Compute the number of UTF-16 elements needed to hold the result of
    /// transcoding the specified UTF-8 string. Upon return, if \a in_begin !=
    /// \a in_end, then the summation stopped due to invalid UTF-8 input. The
    /// returned size then reflects the number of UTF-16 elements needed to hold
    /// the result of transcoding the part of the input that was examined. This
    /// function will only detect a few UTF-8 validity issues, and can therefore
    /// not be used for general UTF-8 validation.
    ///
    /// Only the first byte of each sequence is inspected. The issues that are
    /// detected are an invalid first byte (0x80-0xBF, or >= 0xF8) and a
    /// sequence that is cut short by \a in_end. Continuation bytes, overlong
    /// encodings, and the range of the decoded value are not checked.
    static size_t find_utf16_buf_size(const char*& in_begin, const char* in_end);

    /// Compute the number of UTF-8 bytes needed to hold the result of
    /// transcoding the specified UTF-16 string. Upon return, if \a in_begin !=
    /// \a in_end, then the summation stopped due to invalid UTF-16 input, or to
    /// prevent the returned \c size_t value from overflowing. The returned size
    /// then reflects the number of UTF-8 bytes needed to hold the result of
    /// transcoding the part of the input that was examined. This function will
    /// only detect a few UTF-16 validity issues, and can therefore not be used
    /// for general UTF-16 validation.
    ///
    /// The only validity issue that is detected is an element in the surrogate
    /// range 0xD800-0xDFFF occurring as the last element of the input. Whether
    /// the two halves of a surrogate pair are in their proper ranges is not
    /// checked; any element in the surrogate range is counted together with
    /// the element that follows it as four bytes.
    static size_t find_utf8_buf_size(const Char16*& in_begin, const Char16* in_end);
};


// Implementation:

// Adapted from reference implementation.
// http://www.unicode.org/resources/utf8.html
// http://www.bsdua.org/files/unicode.tar.gz
//
// Decodes one UTF-8 sequence per iteration and appends the corresponding one
// or two UTF-16 elements. The loop ends quietly when the input is exhausted or
// when the output buffer cannot take the next character, and ends with an
// error as soon as a malformed sequence is seen. In every case both cursors
// are written back to the caller, positioned where transcoding stopped.
template <class Char16, class Traits16>
inline bool Utf8x16<Char16, Traits16>::to_utf16(const char*& in_begin, const char* const in_end, Char16*& out_begin,
                                                Char16* const out_end)
{
    using Traits8 = std::char_traits<char>;

    // True for bytes of the form 10xxxxxx.
    const auto is_continuation = [](uint_fast32_t byte) { return (byte & 0xC0) == 0x80; };

    bool ok = true;
    const char* src = in_begin;
    Char16* dst = out_begin;

    // Every character needs at least one output element, so a full output
    // buffer ends the loop before the next input byte is even looked at.
    while (src != in_end && REALM_LIKELY(dst != out_end)) {
        REALM_ASSERT(src >= in_begin && src < in_end);
        const uint_fast16_t lead = uint_fast16_t(Traits8::to_int_type(src[0]));

        // One byte: 0xxxxxxx
        if (REALM_LIKELY(lead < 0x80)) {
            *dst++ = Traits16::to_char_type(lead);
            ++src;
            continue;
        }

        // 10xxxxxx is a continuation byte and cannot start a sequence
        if (REALM_UNLIKELY(lead < 0xC0)) {
            ok = false;
            break;
        }

        const auto remaining = in_end - src; // Bytes left, including the lead byte

        // Two bytes: 110xxxxx 10xxxxxx
        if (REALM_LIKELY(lead < 0xE0)) {
            if (REALM_UNLIKELY(remaining < 2)) {
                ok = false; // Sequence is cut short by end of input
                break;
            }
            REALM_ASSERT(&src[1] >= in_begin && &src[1] < in_end);
            const uint_fast16_t b1 = uint_fast16_t(Traits8::to_int_type(src[1]));
            if (REALM_UNLIKELY(!is_continuation(b1))) {
                ok = false; // Bad continuation byte
                break;
            }
            const uint_fast16_t code = uint_fast16_t(((lead & 0x1F) << 6) | (b1 & 0x3F));
            if (REALM_UNLIKELY(code < 0x80)) {
                ok = false; // Would have fit in one byte (overlong)
                break;
            }
            *dst++ = Traits16::to_char_type(code);
            src += 2;
            continue;
        }

        // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
        if (REALM_LIKELY(lead < 0xF0)) {
            if (REALM_UNLIKELY(remaining < 3)) {
                ok = false; // Sequence is cut short by end of input
                break;
            }
            REALM_ASSERT(&src[1] >= in_begin && &src[2] < in_end);
            const uint_fast16_t b1 = uint_fast16_t(Traits8::to_int_type(src[1]));
            const uint_fast16_t b2 = uint_fast16_t(Traits8::to_int_type(src[2]));
            if (REALM_UNLIKELY(!is_continuation(b1) || !is_continuation(b2))) {
                ok = false; // Bad continuation byte
                break;
            }
            const uint_fast16_t code = uint_fast16_t(((lead & 0x0F) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F));
            if (REALM_UNLIKELY(code < 0x800)) {
                ok = false; // Would have fit in two bytes (overlong)
                break;
            }
            if (REALM_UNLIKELY(0xD800 <= code && code < 0xE000)) {
                ok = false; // Range is reserved for UTF-16 surrogates, not a character
                break;
            }
            *dst++ = Traits16::to_char_type(code);
            src += 3;
            continue;
        }

        // Anything that remains needs a surrogate pair, i.e. two free output
        // elements. This is tested before the lead byte is validated further.
        if (REALM_UNLIKELY(dst + 1 == out_end)) {
            break;
        }

        // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        if (REALM_LIKELY(lead < 0xF8)) {
            if (REALM_UNLIKELY(remaining < 4)) {
                ok = false; // Sequence is cut short by end of input
                break;
            }
            REALM_ASSERT(&src[1] >= in_begin && &src[3] < in_end);
            // The top two fields are shifted beyond 16 bits, so they are held in 32 bit variables
            const uint_fast32_t b0 = uint_fast32_t(lead);
            const uint_fast32_t b1 = uint_fast32_t(Traits8::to_int_type(src[1]));
            const uint_fast16_t b2 = uint_fast16_t(Traits8::to_int_type(src[2]));
            const uint_fast16_t b3 = uint_fast16_t(Traits8::to_int_type(src[3]));
            if (REALM_UNLIKELY(!is_continuation(b1) || !is_continuation(b2) || !is_continuation(b3))) {
                ok = false; // Bad continuation byte
                break;
            }
            uint_fast32_t code =
                uint_fast32_t(((b0 & 0x07) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F));
            if (REALM_UNLIKELY(code < 0x10000)) {
                ok = false; // Would have fit in three bytes (overlong)
                break;
            }
            if (REALM_UNLIKELY(0x110000 <= code)) {
                ok = false; // Not representable in UTF-16
                break;
            }
            // Split the 20 bit offset into the high and low halves of the pair
            code -= 0x10000l;
            *dst++ = Traits16::to_char_type(0xD800 + (code >> 10));
            *dst++ = Traits16::to_char_type(0xDC00 + (code & 0x3FF));
            src += 4;
            continue;
        }

        // Lead byte 0xF8 or above: not a valid start of a sequence, or it
        // announces a code point that is too big for UTF-16
        ok = false;
        break;
    }

    REALM_ASSERT(src >= in_begin && src <= in_end);
    REALM_ASSERT(dst >= out_begin && dst <= out_end);
    in_begin = src;
    out_begin = dst;
    return ok;
}


// Walks the UTF-8 input one sequence at a time, looking only at the first byte
// of each sequence to decide how many input bytes it spans and how many UTF-16
// elements it will produce. Stops at the first invalid lead byte or at a
// sequence that is cut short by the end of the input; \a in_begin is left
// pointing at that position.
template <class Char16, class Traits16>
inline size_t Utf8x16<Char16, Traits16>::find_utf16_buf_size(const char*& in_begin, const char* const in_end)
{
    using Traits8 = std::char_traits<char>;

    size_t utf16_count = 0;
    const char* cursor = in_begin;

    while (cursor != in_end) {
        REALM_ASSERT(cursor >= in_begin && cursor < in_end);
        const uint_fast16_t lead = uint_fast16_t(Traits8::to_int_type(*cursor));

        int seq_len = 0;  // Number of input bytes in this sequence
        size_t units = 1; // Number of UTF-16 elements it transcodes to
        if (REALM_LIKELY(lead < 0x80)) {
            seq_len = 1;
        }
        else if (REALM_UNLIKELY(lead < 0xC0)) {
            break; // A continuation byte cannot start a sequence
        }
        else if (REALM_LIKELY(lead < 0xE0)) {
            seq_len = 2;
        }
        else if (REALM_LIKELY(lead < 0xF0)) {
            seq_len = 3;
        }
        else if (REALM_LIKELY(lead < 0xF8)) {
            seq_len = 4;
            units = 2; // Becomes a surrogate pair
        }
        else {
            break; // Not a valid lead byte, or code point too big for UTF-16
        }

        if (REALM_UNLIKELY(in_end - cursor < seq_len)) {
            break; // Sequence is cut short by end of input
        }
        utf16_count += units;
        cursor += seq_len;
    }

    REALM_ASSERT(cursor >= in_begin && cursor <= in_end);
    in_begin = cursor;
    return utf16_count;
}


// Adapted from reference implementation.
// http://www.unicode.org/resources/utf8.html
// http://www.bsdua.org/files/unicode.tar.gz
//
// Encodes one character per iteration: a single UTF-16 element becomes one,
// two, or three bytes, and a surrogate pair becomes four bytes. A character is
// written only if all of its bytes fit; otherwise the loop ends without
// flagging an error. A malformed surrogate pair ends the loop with an error.
// In every case both cursors are written back to the caller.
template <class Char16, class Traits16>
inline bool Utf8x16<Char16, Traits16>::to_utf8(const Char16*& in_begin, const Char16* const in_end, char*& out_begin,
                                               char* const out_end)
{
    using Traits8 = std::char_traits<char>;
    using Int8 = typename Traits8::int_type;

    bool ok = true;
    const Char16* src = in_begin;
    char* dst = out_begin;

    while (src != in_end) {
        REALM_ASSERT(src >= in_begin && src < in_end);
        const uint_fast16_t unit = uint_fast16_t(Traits16::to_int_type(src[0]));

        if (REALM_LIKELY(unit < 0x80)) {
            // One byte: 0xxxxxxx
            if (REALM_UNLIKELY(dst == out_end)) {
                break; // Output buffer is full
            }
            REALM_ASSERT(dst >= out_begin && dst < out_end);
            *dst++ = Traits8::to_char_type(Int8(unit));
            ++src;
        }
        else if (REALM_LIKELY(unit < 0x800)) {
            // Two bytes: 110xxxxx 10xxxxxx
            if (REALM_UNLIKELY(out_end - dst < 2)) {
                break; // Output buffer is full
            }
            *dst++ = Traits8::to_char_type(Int8(0xC0 + (unit >> 6)));
            REALM_ASSERT(dst >= out_begin && dst < out_end);
            *dst++ = Traits8::to_char_type(Int8(0x80 + (unit & 0x3F)));
            ++src;
        }
        else if (REALM_LIKELY(unit < 0xD800 || 0xE000 <= unit)) {
            // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
            if (REALM_UNLIKELY(out_end - dst < 3)) {
                break; // Output buffer is full
            }
            REALM_ASSERT(dst >= out_begin && dst + 2 < out_end);
            *dst++ = Traits8::to_char_type(Int8(0xE0 + (unit >> 12)));
            *dst++ = Traits8::to_char_type(Int8(0x80 + ((unit >> 6) & 0x3F)));
            *dst++ = Traits8::to_char_type(Int8(0x80 + (unit & 0x3F)));
            ++src;
        }
        else {
            // The element is in the surrogate range. Room for the four output
            // bytes is tested before the pair itself is validated.
            if (REALM_UNLIKELY(out_end - dst < 4)) {
                break; // Output buffer is full
            }
            if (REALM_UNLIKELY(0xDC00 <= unit)) {
                ok = false; // Second half of a pair where the first half was expected
                break;
            }
            if (REALM_UNLIKELY(src + 1 == in_end)) {
                ok = false; // First half of a pair is the last input element
                break;
            }
            REALM_ASSERT(&src[1] >= in_begin && &src[1] < in_end);
            const uint_fast16_t trail = uint_fast16_t(Traits16::to_int_type(src[1]));
            if (REALM_UNLIKELY(trail < 0xDC00 || 0xE000 <= trail)) {
                ok = false; // First half is not followed by a second half
                break;
            }
            // Recombine the two 10 bit halves and add back the 0x10000 offset
            const uint_fast32_t code_point = 0x10000l + (uint_fast32_t(unit - 0xD800) * 0x400 + (trail - 0xDC00));
            // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            REALM_ASSERT(dst >= out_begin && dst + 3 < out_end);
            *dst++ = Traits8::to_char_type(Int8(0xF0 + (code_point >> 18)));
            *dst++ = Traits8::to_char_type(Int8(0x80 + ((code_point >> 12) & 0x3F)));
            *dst++ = Traits8::to_char_type(Int8(0x80 + ((code_point >> 6) & 0x3F)));
            *dst++ = Traits8::to_char_type(Int8(0x80 + (code_point & 0x3F)));
            src += 2;
        }
    }

    REALM_ASSERT(src >= in_begin && src <= in_end);
    REALM_ASSERT(dst >= out_begin && dst <= out_end);
    in_begin = src;
    out_begin = dst;
    return ok;
}


// Walks the UTF-16 input and adds up the number of UTF-8 bytes each character
// will need: one, two, or three bytes for a single element outside the
// surrogate range, and four bytes for an element in the surrogate range taken
// together with the element that follows it. Stops early if such an element is
// the last one in the input, or if adding to the total would overflow;
// \a in_begin is left pointing at the position where counting stopped.
template <class Char16, class Traits16>
inline size_t Utf8x16<Char16, Traits16>::find_utf8_buf_size(const Char16*& in_begin, const Char16* const in_end)
{
    size_t byte_total = 0;
    const Char16* cursor = in_begin;

    while (cursor != in_end) {
        REALM_ASSERT(cursor >= in_begin && cursor < in_end);
        const uint_fast16_t unit = uint_fast16_t(Traits16::to_int_type(*cursor));

        int utf8_len = 0; // Number of UTF-8 bytes this character needs
        int consumed = 1; // Number of UTF-16 elements this character spans
        if (REALM_LIKELY(unit < 0x80)) {
            utf8_len = 1;
        }
        else if (REALM_LIKELY(unit < 0x800)) {
            utf8_len = 2;
        }
        else if (REALM_LIKELY(unit < 0xD800 || 0xE000 <= unit)) {
            utf8_len = 3;
        }
        else {
            if (REALM_UNLIKELY(cursor + 1 == in_end)) {
                break; // Surrogate pair is cut short by end of input
            }
            utf8_len = 4;
            consumed = 2;
        }

        if (REALM_UNLIKELY(int_add_with_overflow_detect(byte_total, utf8_len))) {
            break; // Total would no longer fit in size_t
        }
        cursor += consumed;
    }

    REALM_ASSERT(cursor >= in_begin && cursor <= in_end);
    in_begin = cursor;
    return byte_total;
}
} // namespace util
} // namespace realm

#endif // REALM_UTIL_UTF8_HPP