= "\x80" && $c < "\xc0") { // Legal tail bytes are nice. $sequence .= $c; } else { if ($len == 0) { // Premature end of string! // Drop a replacement character into output to // represent the invalid UTF-8 sequence. $result .= $unknown; break 2; } else { // Illegal tail byte; abandon the sequence. $result .= $unknown; // Back up and reprocess this byte; it may itself // be a legal ASCII or UTF-8 sequence head. --$i; ++$len; continue 2; } } } while (--$remaining); $n = ord($head); if ($n <= 0xdf) { $ord = ($n - 192) * 64 + (ord($sequence{1}) - 128); } else if ($n <= 0xef) { $ord = ($n - 224) * 4096 + (ord($sequence{1}) - 128) * 64 + (ord($sequence{2}) - 128); } else if ($n <= 0xf7) { $ord = ($n - 240) * 262144 + (ord($sequence{1}) - 128) * 4096 + (ord($sequence{2}) - 128) * 64 + (ord($sequence{3}) - 128); } else if ($n <= 0xfb) { $ord = ($n - 248) * 16777216 + (ord($sequence{1}) - 128) * 262144 + (ord($sequence{2}) - 128) * 4096 + (ord($sequence{3}) - 128) * 64 + (ord($sequence{4}) - 128); } else if ($n <= 0xfd) { $ord = ($n - 252) * 1073741824 + (ord($sequence{1}) - 128) * 16777216 + (ord($sequence{2}) - 128) * 262144 + (ord($sequence{3}) - 128) * 4096 + (ord($sequence{4}) - 128) * 64 + (ord($sequence{5}) - 128); } $result .= _transliteration_replace($ord, $unknown, $locale); $head = ''; } elseif ($c < "\x80") { // ASCII byte. $result .= $c; $head = ''; } elseif ($c < "\xc0") { // Illegal tail bytes. if ($head == '') { $result .= $unknown; } } else { // Miscellaneous freaks. $result .= $unknown; $head = ''; } } } return $result; } /** * Load the transliteration database and replace a Unicode character. * * @param $ord * A ordinal Unicode character code. * @param $unknown * Replacement string for characters that do not have a suitable ASCII * equivalent. * @param $locale * Optional ISO 639 language code that denotes the language of the input. * Used to apply language-specific optimizations. Defaults to the current * display language. 
* @return
 *   ASCII replacement character, or $unknown when no mapping exists for the
 *   given code point.
 */
function _transliteration_replace($ord, $unknown = '?', $locale = NULL) {
  // $template caches the raw tables returned by each bank's data file;
  // $map caches the effective lookup table per bank and language. Both are
  // static, so every data file is included at most once per script run.
  static $map = array(), $template = array();

  // Resolve the language into a separate local variable. A "global $locale"
  // statement would bind the parameter to the global by reference, so any
  // default assigned afterwards would leak into the global scope.
  if (isset($locale)) {
    $langcode = $locale;
  }
  elseif (isset($GLOBALS['locale'])) {
    $langcode = $GLOBALS['locale'];
  }
  else {
    // Neither an explicit nor a global language is available; use the base
    // table instead of indexing the caches with NULL.
    $langcode = 'en';
  }

  // The upper bits select the bank (one data file per 256 code points).
  $bank = $ord >> 8;

  // Check if we need to load a new bank.
  if (!isset($template[$bank])) {
    $file = drupal_get_path('module', 'transliteration') . '/data/' . sprintf('x%02x', $bank) . '.php';
    if (file_exists($file)) {
      $tables = include ($file);
    }
    else {
      $tables = array();
    }
    // include evaluates to 1 when the file has no return statement and to
    // FALSE on failure; treat anything that is not an array as "no data".
    if (!is_array($tables)) {
      $tables = array();
    }
    // The 'en' table is the base every language falls back to, so it must
    // always exist and be an array for the union below to be valid.
    if (!isset($tables['en']) || !is_array($tables['en'])) {
      $tables['en'] = array();
    }
    $template[$bank] = $tables;
  }

  // Check if we need to create new mappings with locale specific alterations.
  if (!isset($map[$bank][$langcode])) {
    if ($langcode != 'en' && isset($template[$bank][$langcode]) && is_array($template[$bank][$langcode])) {
      // Array union keeps the left operand's entries, so language specific
      // replacements take precedence over the default table.
      $map[$bank][$langcode] = $template[$bank][$langcode] + $template[$bank]['en'];
    }
    else {
      $map[$bank][$langcode] = $template[$bank]['en'];
    }
  }

  // The low byte is the character's index within its bank.
  $index = $ord & 255;

  return isset($map[$bank][$langcode][$index]) ? $map[$bank][$langcode][$index] : $unknown;
}