utf8_to_unicode

Uncategorized

October 2007

http://www.randomchaos.com/documents/?source=php_and_unicode

/**
 * Decode a UTF-8 encoded byte string into a list of Unicode code points.
 *
 * Handles 1-, 2-, 3- and 4-byte sequences. The input is NOT validated:
 * every byte >= 0x80 is collected into the current sequence until the
 * length announced by the lead byte is reached, and an incomplete sequence
 * at the end of the string is silently dropped.
 *
 * Author's note kept from the original source: this function is part of a
 * workaround for line break replacement, i.e. replacing the line break that
 * comes from InDesign with whitespace. The character in question is U+2028
 * (UTF-8 bytes E2 80 A8, URL-encoded %e2%80%a8, HTML entity &#8232; /
 * &#x2028;), see http://www.fileformat.info/info/unicode/char/2028/index.htm
 * A plain str_replace() on that symbol had been left commented out here.
 *
 * @param string $str UTF-8 encoded input.
 * @return array List of integer code points, in input order.
 */
function utf8_to_unicode( $str ) {
    $unicode    = array();
    $values     = array(); // bytes collected so far for the current multi-byte sequence
    $lookingFor = 1;       // total byte length of the sequence being collected
    $length     = strlen( $str ); // byte length, computed once instead of per iteration

    for ( $i = 0; $i < $length; $i++ ) {
        $thisValue = ord( $str[ $i ] );

        // Plain ASCII: the byte value is the code point.
        if ( $thisValue < 128 ) {
            $unicode[] = $thisValue;
            continue;
        }

        // First byte of a multi-byte sequence: the lead byte tells how many
        // bytes the sequence has (below 0xE0 => 2, below 0xF0 => 3, else 4).
        if ( count( $values ) == 0 ) {
            if ( $thisValue < 224 ) {
                $lookingFor = 2;
            } elseif ( $thisValue < 240 ) {
                $lookingFor = 3;
            } else {
                $lookingFor = 4;
            }
        }

        $values[] = $thisValue;

        if ( count( $values ) == $lookingFor ) {
            // Strip the marker bits from each byte and concatenate the
            // payload bits (6 per continuation byte).
            if ( $lookingFor == 4 ) {
                $number = ( ( $values[0] & 0x07 ) << 18 )
                    | ( ( $values[1] & 0x3F ) << 12 )
                    | ( ( $values[2] & 0x3F ) << 6 )
                    | ( $values[3] & 0x3F );
            } elseif ( $lookingFor == 3 ) {
                $number = ( ( $values[0] & 0x0F ) << 12 )
                    | ( ( $values[1] & 0x3F ) << 6 )
                    | ( $values[2] & 0x3F );
            } else {
                $number = ( ( $values[0] & 0x1F ) << 6 )
                    | ( $values[1] & 0x3F );
            }

            $unicode[]  = $number;
            $values     = array();
            $lookingFor = 1;
        }
    }

    return $unicode;
} // utf8_to_unicode