diff options
Diffstat (limited to 'src/reader.c')
-rw-r--r-- | src/reader.c | 82 |
1 files changed, 53 insertions, 29 deletions
diff --git a/src/reader.c b/src/reader.c index 66dec851..97439540 100644 --- a/src/reader.c +++ b/src/reader.c @@ -136,38 +136,43 @@ readahead(SerdReader parser, uint8_t* pre, int n) return true; } +static inline unsigned +utf8_char_len(const uint8_t b0) +{ + if ((b0 & 0x80) == 0) { // Starts with `0' + return 1; + } else if ((b0 & 0xE0) == 0xC0) { // Starts with `110' + return 2; + } else if ((b0 & 0xF0) == 0xE0) { // Starts with `1110' + return 3; + } else if ((b0 & 0xF8) == 0xF0) { // Starts with `11110' + return 4; + } else { + return 0; + } +} + static inline uchar -read_utf8_char(SerdReader parser) +peek_utf8_char(SerdReader parser, unsigned* n_bytes) { - if (parser->read_head == READ_BUF_LEN) { - return error(parser, "page fault\n"); - } - const uchar c = parser->read_buf[parser->read_head++]; - switch (c) { - case '\n': ++parser->cur.line; parser->cur.col = 0; break; - default: ++parser->cur.col; - } - /*while ((byte & 0xC0) == 0x80) { - // Starts with `10', continuation byte - character += (byte & 0x7F); - byte = getc(parser->fd); - }*/ - return c; + const uint8_t b0 = parser->read_buf[parser->read_head]; + *n_bytes = 1; + return b0; } static inline uchar peek_char(SerdReader parser) { - if (parser->eof) { - return EOF; - } - return parser->read_buf[parser->read_head]; + unsigned n_bytes; + return peek_utf8_char(parser, &n_bytes); } static inline uchar eat_char(SerdReader parser, const uchar character) { - const uchar c = parser->read_buf[parser->read_head++]; + unsigned n_bytes; + const uchar c = peek_utf8_char(parser, &n_bytes); + parser->read_head += n_bytes; switch (c) { case '\0': return error(parser, "unexpected end of file\n"); case '\n': ++parser->cur.line; parser->cur.col = 0; break; @@ -265,11 +270,11 @@ push_char(SerdReader parser, Ref ref, const uchar c) stack_push(parser, 1); SerdString* const str = deref(parser, ref); ++str->n_bytes; - if ((c & 0xC0) == 0x80) { - fprintf(stderr, "PUSH WIDE CHAR %X\n", c); - } else { + if ((c & 0xC0) != 0x80) { + // Does not start with `10', start of a new character ++str->n_chars; } + assert(str->n_bytes > str->n_chars); str->buf[str->n_bytes - 2] = c; str->buf[str->n_bytes - 1] = '\0'; } @@ -307,7 +312,7 @@ read_hex(SerdReader parser) { const uchar c = peek_char(parser); if (in_range(c, 0x30, 0x39) || in_range(c, 0x41, 0x46)) { - return c; + return eat_char(parser, c); } else { return error(parser, "illegal hexadecimal digit `%c'\n", c); } @@ -316,12 +321,31 @@ read_hex(SerdReader parser) static inline uchar read_hex_escape(SerdReader parser, unsigned length) { - uchar ret = 0; - unsigned mult = 1; + uchar ret = 0; + uint8_t chars[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + uint8_t code[4] = { 0, 0, 0, 0 }; for (unsigned i = 0; i < length; ++i) { - const uchar c = read_hex(parser); - ret += (c * mult); - mult *= 8; + chars[i] = read_hex(parser); + } + + sscanf((const char*)chars, "%X", (uint32_t*)code); + const uint32_t code_num = *(uint32_t*)code; + if (code_num < 0x80) { + fprintf(stderr, "1 byte UTF-8 escape\n"); + return code[0]; + } else if (code_num < 0x800) { + fprintf(stderr, "2 byte UTF-8 escape\n"); + fprintf(stderr, "B0 %X\n", code[0]); + fprintf(stderr, "B1 %X\n", code[1]); + fprintf(stderr, "B2 %X\n", code[2]); + fprintf(stderr, "B3 %X\n", code[3]); + ret = ((0xC0 + ((code[3] & 0x1F) << 2) + ((code[4] & 0xC0) >> 6)) << 8) + + (code[4] & 0x3F); + fprintf(stderr, "RET %X\n", ret); + } else if (code_num < 0x10000) { + fprintf(stderr, "3 byte UTF-8 escape\n"); + } else { + fprintf(stderr, "4 byte UTF-8 escape\n"); } return ret; } |