/* Source: libtcu / utf8z.h (full commit) */
/*	Copyright 2012 Christoph Gärtner
	Distributed under the Boost Software License, Version 1.0
*/

#ifndef UTF8Z_H_
#define UTF8Z_H_

#include <stddef.h>
#include <stdint.h>

#ifndef UTF8Z_INLINE
#define UTF8Z_INLINE inline
#endif

/*	Bit flags returned by utf8z_classify_byte() and
	utf8z_classify_codepoint().

	BC_* flags describe a byte, CC_* flags describe a codepoint and SC_*
	flags give the length in bytes of the encoded sequence. Every flag
	occupies its own bit, so flags can be combined with bitwise OR and
	tested with bitwise AND.
*/
enum
{
	// byte classes
	UTF8Z_BC_LEADING		= 0x001,	// first byte of a sequence
	UTF8Z_BC_CONTINUATION	= 0x002,	// byte of the form 10xxxxxx
	UTF8Z_BC_ZERO			= 0x004,	// the byte 0x00

	// codepoint classes
	UTF8Z_CC_CHAR			= 0x008,	// ordinary codepoint
	UTF8Z_CC_LOW_SURROGATE	= 0x010,	// 0xDC00..0xDFFF
	UTF8Z_CC_HIGH_SURROGATE	= 0x020,	// 0xD800..0xDBFF
	UTF8Z_CC_NONCHAR		= 0x040,	// 0xFDD0..0xFDEF, 0x..FFFE, 0x..FFFF
	UTF8Z_CC_OVERFLOW		= 0x080,	// 0x110000..0x1FFFFF

	// sequence lengths
	UTF8Z_SC_SINGLE			= 0x100,	// 1 byte
	UTF8Z_SC_DOUBLE			= 0x200,	// 2 bytes
	UTF8Z_SC_TRIPLE			= 0x400,	// 3 bytes
	UTF8Z_SC_QUAD			= 0x800		// 4 bytes
};

// Classifies a single byte. Returns UTF8Z_BC_ZERO for 0, UTF8Z_BC_LEADING
// combined with a UTF8Z_SC_* length flag for a leading byte,
// UTF8Z_BC_CONTINUATION for a continuation byte and 0 for any other byte
// (0xF8..0xFF).
UTF8Z_INLINE unsigned utf8z_classify_byte(uint8_t byte);

// Classifies a codepoint. Returns one UTF8Z_CC_* flag combined with the
// UTF8Z_SC_* flag for its encoded length, or 0 if cp is above 0x1FFFFF.
// Codepoint 0 is reported as a two-byte sequence.
UTF8Z_INLINE unsigned utf8z_classify_codepoint(uint32_t cp);

// Encodes cp into the buffer at bytes, writing between 1 and 4 bytes.
// Returns a pointer just past the last byte written, or NULL (nothing
// written) if cp is above 0x1FFFFF. Surrogates, noncharacters and values
// in 0x110000..0x1FFFFF are encoded like any other codepoint.
UTF8Z_INLINE void *utf8z_encode(void *bytes, uint32_t cp);

// Decodes the sequence starting at bytes into *cp and returns a pointer to
// the byte following the sequence. Returns NULL and leaves *cp untouched if
// the first byte is not a leading byte (this includes the byte 0) or if one
// of the expected continuation bytes is missing. The decoded value is not
// checked for overlong encodings or codepoint class.
UTF8Z_INLINE const void *utf8z_decode(
	uint32_t *restrict cp, const void *restrict bytes);

// Incremental decoding step: classifies byte and folds it into the
// partially decoded value cp, returning the new value. A leading byte
// yields its payload bits (cp is ignored), a continuation byte appends its
// low six bits to cp. Returns (uint32_t)-1 for any other byte, including 0.
UTF8Z_INLINE uint32_t utf8z_decode_byte(uint32_t cp, uint8_t byte);

// Same as utf8z_decode_byte(), except that the classification bc of byte
// (as returned by utf8z_classify_byte()) is supplied by the caller.
UTF8Z_INLINE uint32_t utf8z_decode_classified_byte(
	uint32_t cp, unsigned bc, uint8_t byte);

// HERE BE DRAGONS

// Classifies a byte by its high-order bit pattern.
// Returns UTF8Z_BC_ZERO for 0, UTF8Z_BC_CONTINUATION for 10xxxxxx,
// UTF8Z_BC_LEADING plus the matching UTF8Z_SC_* flag for a leading byte,
// and 0 for bytes that match none of the patterns (11111xxx).
UTF8Z_INLINE unsigned utf8z_classify_byte(uint8_t byte)
{
	// the zero byte gets its own class and is not reported as a
	// single-byte sequence
	if(byte == 0x00)
		return UTF8Z_BC_ZERO;

	if((byte & 0x80) == 0x00)	// 0xxxxxxx
		return UTF8Z_BC_LEADING | UTF8Z_SC_SINGLE;

	if((byte & 0xC0) == 0x80)	// 10xxxxxx
		return UTF8Z_BC_CONTINUATION;

	if((byte & 0xE0) == 0xC0)	// 110xxxxx
		return UTF8Z_BC_LEADING | UTF8Z_SC_DOUBLE;

	if((byte & 0xF0) == 0xE0)	// 1110xxxx
		return UTF8Z_BC_LEADING | UTF8Z_SC_TRIPLE;

	if((byte & 0xF8) == 0xF0)	// 11110xxx
		return UTF8Z_BC_LEADING | UTF8Z_SC_QUAD;

	return 0;
}

// Classifies a codepoint, grouped by the length of its encoding.
// Returns one UTF8Z_CC_* flag combined with one UTF8Z_SC_* flag, or 0 if
// cp is above 0x1FFFFF.
UTF8Z_INLINE unsigned utf8z_classify_codepoint(uint32_t cp)
{
	// codepoint 0 is classified as a two-byte sequence, not a single byte
	if(cp == 0)
		return UTF8Z_CC_CHAR | UTF8Z_SC_DOUBLE;

	if(cp < 0x80)
		return UTF8Z_CC_CHAR | UTF8Z_SC_SINGLE;

	if(cp < 0x800)
		return UTF8Z_CC_CHAR | UTF8Z_SC_DOUBLE;

	if(cp < 0x10000)
	{
		unsigned cc = UTF8Z_CC_CHAR;

		if(cp >= 0xD800 && cp < 0xDC00)
			cc = UTF8Z_CC_HIGH_SURROGATE;
		else if(cp >= 0xDC00 && cp < 0xE000)
			cc = UTF8Z_CC_LOW_SURROGATE;
		else if((cp >= 0xFDD0 && cp < 0xFDF0) || cp >= 0xFFFE)
			cc = UTF8Z_CC_NONCHAR;

		return cc | UTF8Z_SC_TRIPLE;
	}

	if(cp < 0x110000)
	{
		// bits 1..15 all set: low 16 bits are 0xFFFE or 0xFFFF
		if((cp & 0xFFFE) == 0xFFFE)
			return UTF8Z_CC_NONCHAR | UTF8Z_SC_QUAD;

		return UTF8Z_CC_CHAR | UTF8Z_SC_QUAD;
	}

	// above 0x10FFFF, but still representable in four bytes
	if(cp < 0x200000)
		return UTF8Z_CC_OVERFLOW | UTF8Z_SC_QUAD;

	return 0;
}

// Encodes cp into the buffer at bytes, using the sequence length reported
// by utf8z_classify_codepoint(). Writes 1 to 4 bytes and returns a pointer
// just past the last byte written. Returns NULL without writing anything
// if the classification carries no length flag.
UTF8Z_INLINE void *utf8z_encode(void *bytes, uint32_t cp)
{
	uint8_t *out = bytes;
	const unsigned cls = utf8z_classify_codepoint(cp);

	if(cls & UTF8Z_SC_SINGLE)
	{
		*out++ = (uint8_t)cp;
		return out;
	}

	if(cls & UTF8Z_SC_DOUBLE)
	{
		// 110xxxxx 10xxxxxx
		*out++ = (uint8_t)(0xC0 | (cp >> 6));
		*out++ = (uint8_t)(0x80 | (cp & 0x3F));
		return out;
	}

	if(cls & UTF8Z_SC_TRIPLE)
	{
		// 1110xxxx 10xxxxxx 10xxxxxx
		*out++ = (uint8_t)(0xE0 | (cp >> 12));
		*out++ = (uint8_t)(0x80 | ((cp >> 6) & 0x3F));
		*out++ = (uint8_t)(0x80 | (cp & 0x3F));
		return out;
	}

	if(cls & UTF8Z_SC_QUAD)
	{
		// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
		*out++ = (uint8_t)(0xF0 | (cp >> 18));
		*out++ = (uint8_t)(0x80 | ((cp >> 12) & 0x3F));
		*out++ = (uint8_t)(0x80 | ((cp >> 6) & 0x3F));
		*out++ = (uint8_t)(0x80 | (cp & 0x3F));
		return out;
	}

	return NULL;
}

// Decodes the sequence starting at bytes and stores the result in *cp.
// Returns a pointer to the byte following the sequence. Returns NULL and
// leaves *cp untouched if the first byte carries no length flag or if one
// of the following bytes is not a continuation byte.
UTF8Z_INLINE const void *utf8z_decode(
	uint32_t *restrict cp, const void *restrict bytes)
{
	const uint8_t *src = bytes;
	const unsigned lead = utf8z_classify_byte(src[0]);
	unsigned tail;		// number of continuation bytes expected
	uint32_t value;		// codepoint assembled so far

	if(lead & UTF8Z_SC_SINGLE)
	{
		*cp = src[0];
		return src + 1;
	}

	if(lead & UTF8Z_SC_DOUBLE)
	{
		value = src[0] & 0x1F;
		tail = 1;
	}
	else if(lead & UTF8Z_SC_TRIPLE)
	{
		value = src[0] & 0x0F;
		tail = 2;
	}
	else if(lead & UTF8Z_SC_QUAD)
	{
		value = src[0] & 0x07;
		tail = 3;
	}
	else
	{
		return NULL;
	}

	// bytes are examined in order and reading stops at the first byte
	// that is not a continuation byte
	for(unsigned i = 1; i <= tail; ++i)
	{
		if(!(utf8z_classify_byte(src[i]) & UTF8Z_BC_CONTINUATION))
			return NULL;

		value = (value << 6) | (uint32_t)(src[i] & 0x3F);
	}

	*cp = value;
	return src + tail + 1;
}

// Incremental decoding step: classifies byte with utf8z_classify_byte()
// and hands the result to utf8z_decode_classified_byte().
UTF8Z_INLINE uint32_t utf8z_decode_byte(uint32_t cp, uint8_t byte)
{
	const unsigned bc = utf8z_classify_byte(byte);

	return utf8z_decode_classified_byte(cp, bc, byte);
}

// Incremental decoding step for a byte whose classification bc is already
// known. A single-byte sequence yields the byte itself, a continuation
// byte appends its low six bits to cp, and a multi-byte leading byte
// yields its payload bits (cp is ignored). Returns (uint32_t)-1 if bc
// carries none of these flags.
UTF8Z_INLINE uint32_t utf8z_decode_classified_byte(
	uint32_t cp, unsigned bc, uint8_t byte)
{
	if(bc & UTF8Z_SC_SINGLE)
		return byte;

	if(bc & UTF8Z_BC_CONTINUATION)
		return (cp << 6) | (uint32_t)(byte & 0x3F);

	// leading byte of a multi-byte sequence: keep only the payload bits
	uint32_t payload_mask;

	if(bc & UTF8Z_SC_DOUBLE)
		payload_mask = 0x1F;
	else if(bc & UTF8Z_SC_TRIPLE)
		payload_mask = 0x0F;
	else if(bc & UTF8Z_SC_QUAD)
		payload_mask = 0x07;
	else
		return UINT32_MAX;

	return byte & payload_mask;
}

#endif