/* Copyright 2011-2020 David Robillard Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted, provided that the above copyright notice and this permission notice appear in all copies. THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #ifndef SERD_STRING_UTILS_H #define SERD_STRING_UTILS_H #include #include #include /** Unicode replacement character in UTF-8 */ static const uint8_t replacement_char[] = {0xEF, 0xBF, 0xBD}; /** Return true if `c` lies within [`min`...`max`] (inclusive) */ static inline bool in_range(const int c, const int min, const int max) { return (c >= min && c <= max); } /** RFC2234: ALPHA ::= %x41-5A / %x61-7A ; A-Z / a-z */ static inline bool is_alpha(const int c) { return in_range(c, 'A', 'Z') || in_range(c, 'a', 'z'); } /** RFC2234: DIGIT ::= %x30-39 ; 0-9 */ static inline bool is_digit(const int c) { return in_range(c, '0', '9'); } /* RFC2234: HEXDIG ::= DIGIT / "A" / "B" / "C" / "D" / "E" / "F" */ static inline bool is_hexdig(const int c) { return is_digit(c) || in_range(c, 'A', 'F'); } /* Turtle / JSON / C: XDIGIT ::= DIGIT / A-F / a-f */ static inline bool is_xdigit(const int c) { return is_hexdig(c) || in_range(c, 'a', 'f'); } static inline bool is_space(const char c) { switch (c) { case ' ': case '\f': case '\n': case '\r': case '\t': case '\v': return true; default: return false; } } static inline bool is_print(const int c) { return c >= 0x20 && c <= 0x7E; } static inline bool is_windows_path(const char* path) { return is_alpha(path[0]) && (path[1] == ':' || path[1] == '|') && (path[2] == '/' || path[2] == '\\'); } static inline char serd_to_upper(const char c) { return (char)((c >= 'a' && c <= 'z') ? c - 32 : c); } static inline int serd_strncasecmp(const char* s1, const char* s2, size_t n) { for (; n > 0 && *s2; s1++, s2++, --n) { if (serd_to_upper(*s1) != serd_to_upper(*s2)) { return ((*(const uint8_t*)s1 < *(const uint8_t*)s2) ? -1 : +1); } } return 0; } static inline uint32_t utf8_num_bytes(const uint8_t leading) { static const uint8_t lengths[32] = { 1u, // 00000xxx 1u, // 00001xxx 1u, // 00010xxx 1u, // 00011xxx 1u, // 00100xxx 1u, // 00101xxx 1u, // 00110xxx 1u, // 00111xxx 1u, // 01000xxx 1u, // 01001xxx 1u, // 01010xxx 1u, // 01011xxx 1u, // 01100xxx 1u, // 01101xxx 1u, // 01110xxx 1u, // 01111xxx 0u, // 10000xxx 0u, // 10001xxx 0u, // 10010xxx 0u, // 10011xxx 0u, // 10100xxx 0u, // 10101xxx 0u, // 10110xxx 0u, // 10111xxx 2u, // 11000xxx 2u, // 11001xxx 2u, // 11010xxx 2u, // 11011xxx 3u, // 11100xxx 3u, // 11101xxx 4u, // 11110xxx 0u // 11111xxx }; return lengths[leading >> 3u]; } static inline unsigned utf8_num_bytes_for_codepoint(const uint32_t code) { if (code < 0x00000080) { return 1u; } if (code < 0x00000800) { return 2u; } if (code < 0x00010000) { return 3u; } if (code < 0x00110000) { return 4u; } return 0u; // Out of range } /// Return the code point of a UTF-8 character with known length static inline uint32_t parse_counted_utf8_char(const uint8_t* const utf8, const size_t size) { uint32_t c = utf8[0] & ((1u << (8u - size)) - 1u); for (size_t i = 1; i < size; ++i) { c = (c << 6) | (utf8[i] & 0x3Fu); } return c; } /// Parse a UTF-8 character, set *size to the length, and return the code point static inline uint32_t parse_utf8_char(const uint8_t* const utf8, size_t* const size) { switch (*size = utf8_num_bytes(utf8[0])) { case 1: case 2: case 3: case 4: return parse_counted_utf8_char(utf8, *size); default: *size = 0; return 0u; } } #endif // SERD_STRING_UTILS_H