Snippets

Joran Rood PHP UTF-8 Unicode Code Point Generator

Created by Joran Rood
<?php

/**
 * @author Joran Rood <joran@rood.me>
 */

declare(strict_types=1);

/**
 * Lazily decodes a UTF-8 byte string into Unicode code points.
 *
 * Each yielded key is the byte offset at which the code point starts and
 * each yielded value is the decoded code point. Any byte that cannot be
 * decoded as part of a well-formed sequence is reported individually as
 * the replacement code point (0xfffd), keyed by that byte's own offset.
 */
function utf8_unpack_string(string $string): Generator {
    $byte_count = strlen($string);
    $offset = 0;

    while ($offset < $byte_count) {
        $lead = ord($string[$offset]);

        if ($lead < 0x80) {
            // ASCII: the byte is the code point (0x00-0x7f)
            yield $offset => $lead;
            $offset += 1;
            continue;
        }

        /* Classify the lead byte: how many continuation bytes follow,
           which payload bits it carries, and the smallest code point
           that legitimately needs a sequence of this length */
        if (($lead & 0xe0) === 0xc0) {
            // 110xxxxx: two-byte sequence (0x80-0x7ff)
            $tail_count = 1;
            $code = ($lead & 0x1f);
            $minimum = 0x80;
        } elseif (($lead & 0xf0) === 0xe0) {
            // 1110xxxx: three-byte sequence (0x800-0xffff)
            $tail_count = 2;
            $code = ($lead & 0x0f);
            $minimum = 0x800;
        } elseif (($lead & 0xf8) === 0xf0) {
            // 11110xxx: four-byte sequence (0x10000-0x1fffff)
            $tail_count = 3;
            $code = ($lead & 0x07);
            $minimum = 0x10000;
        } else {
            // Stray continuation byte or invalid byte (0xf8-0xff)
            $tail_count = 0;
            $code = 0;
            $minimum = 0;
        }

        // The whole sequence has to fit inside the string
        $well_formed = (($tail_count > 0)
            && (($offset + $tail_count) < $byte_count));

        for ($step = 1; $well_formed && ($step <= $tail_count); $step++) {
            $tail = ord($string[($offset + $step)]);

            if (($tail & 0xc0) === 0x80) {
                // 10xxxxxx: append six payload bits
                $code = (($code << 6) | ($tail & 0x3f));
            } else {
                $well_formed = false;
            }
        }

        if ($well_formed
            && ($code >= $minimum)
            && ($code <= 0x10ffff)
            && (($code < 0xd800) || ($code > 0xdfff))) {
            /* Shortest-form encoding of a valid code point
               (not a surrogate, not beyond 0x10ffff) */
            yield $offset => $code;
            $offset += ($tail_count + 1);
            continue;
        }

        /* Invalid lead byte, misplaced continuation byte, truncated
           sequence, overlong encoding, surrogate (0xd800-0xdfff) or
           out-of-range value (0x110000-0x1fffff): replace this single
           byte and resume decoding at the next one */
        yield $offset => 0xfffd;
        $offset += 1;
    }
}

/**
 * Packs a series of code points into a UTF-8 string.
 *
 * Any iterable is accepted (plain array, Iterator, Generator, ...); keys
 * are ignored and values are encoded in iteration order. Each invalid
 * code point (negative, above 0x10ffff, or a surrogate 0xd800-0xdfff)
 * is packed as the replacement code point (0xfffd).
 *
 * Note: the `iterable` parameter type requires PHP 7.1 or newer.
 *
 * @param iterable<int> $codes Code points to encode.
 * @return string The UTF-8 encoded bytes.
 */
function utf8_pack_string(iterable $codes): string {
    $string = '';

    foreach ($codes as $code) {
        if (($code < 0x00)
            || ($code >= 0x110000)
            || (($code >= 0xd800) && ($code < 0xe000))) {
            /* Replace negative values, values beyond the Unicode range
                (0x110000-) and surrogates (0xd800-0xdfff) */
            $code = 0xfffd;
        }

        if ($code < 0x80) {
            // One-byte code point (0x00-0x7f)
            $string .= chr($code);

        } elseif ($code < 0x800) {
            // Two-byte code point (0x80-0x7ff): 110xxxxx 10xxxxxx
            $string .= chr((($code >> 6) | 0xc0))
                . chr((($code & 0x3f) | 0x80));

        } elseif ($code < 0x10000) {
            // Three-byte code point (0x800-0xffff): 1110xxxx 10xxxxxx 10xxxxxx
            $string .= chr((($code >> 12) | 0xe0))
                . chr(((($code >> 6) & 0x3f) | 0x80))
                . chr((($code & 0x3f) | 0x80));

        } else {
            /* Four-byte code point (0x10000-0x10ffff):
                11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
            $string .= chr((($code >> 18) | 0xf0))
                . chr(((($code >> 12) & 0x3f) | 0x80))
                . chr(((($code >> 6) & 0x3f) | 0x80))
                . chr((($code & 0x3f) | 0x80));
        }
    }

    return $string;
}

Comments (0)

HTTPS SSH

You can clone a snippet to your computer for local editing. Learn more.