diff --git a/upload/config.php b/upload/config.php index 237c6150f..29ffbe551 100644 --- a/upload/config.php +++ b/upload/config.php @@ -55,10 +55,9 @@ $domain_name = 'torrentpier.me'; // Enter here your primary d $domain_name = (!empty($_SERVER['SERVER_NAME'])) ? $_SERVER['SERVER_NAME'] : $domain_name; // Increase number of revision after update -$bb_cfg['tp_version'] = '2.5 Beta'; - -$bb_cfg['tp_release_date'] = '14-11-2012'; -$bb_cfg['tp_release_state'] = 'R477'; +$bb_cfg['tp_version'] = '2.5 pre-stable'; +$bb_cfg['tp_release_date'] = '28-11-2012'; +$bb_cfg['tp_release_state'] = 'R478'; // Database $charset = 'utf8'; diff --git a/upload/includes/class.correct.php b/upload/includes/class.correct.php new file mode 100644 index 000000000..2d11556bc --- /dev/null +++ b/upload/includes/class.correct.php @@ -0,0 +1,3104 @@ + 'cosmo' (2 первых и последняя буква — ошибочные) + * "\x78\x70\x65н" => 'хрен' (первые 3 буквы — ошибочные) + * "вебvfcnth" => 'вебмастер' + * "webьфыеук" => 'webmaster' + * "цццюмуыеш.ru" => 'www.vesti.ru' + * "\x54.\x43.\x48\x61вка" => 'Т.С.Навка' + * + * Hints + * Типичный пример алгоритма работы для поля ввода с автодополнением: + * 1. Сделать выборку по исходному запросу; + * 2. Если есть результат, возвратить его и исходный запрос; + * 3. Иначе скорректировать исходный запрос через Text_LangCorrect; + * 4. Если исходный и скорректированный запрос совпадает, возвратить пустой результат и исходный запрос; + * 5. Иначе сделать выборку по скорректированному запросу; + * 6. Возвратить результат. Если результат не пустой, возвратить скорректированный запрос, иначе исходный. + * + * License + * Только для некоммерческого использования! 
+ * + * @link http://code.google.com/p/php-lang-correct/ + * @license http://creativecommons.org/licenses/by-nc-sa/3.0/ + * @author Nasibullin Rinat + * @version 1.4.3 + */ +class Text_LangCorrect +{ + /** + * Флаг для исправления ошибочно набранных букв в словах, + * которые выглядят одинаково в разных раскладках клавиатуры. + * Алгоритм работает достаточно надёжно и быстро. + */ + const SIMILAR_CHARS = 1; + + /** + * Флаг для исправления ошибочно набранных слов в другой раскладке клавиатуры. + * Алгоритм может иногда ошибаться, работает в разы медленнее, чем SIMILAR_CHARS. + */ + const KEYBOARD_LAYOUT = 2; + + /** + * Флаг для добавления исправлений, если включён флаг KEYBOARD_LAYOUT + * Синтаксис и пример: "(,.cn=>бюст)" + * ^ ^^ ^ + */ + const ADD_FIX = 4; + + #английский (all) + private $en = '[a-zA-Z]'; + + #английский (uppercase) + private $en_uc = '[A-Z]'; + + #английский + символы, которые м.б. набраны по ошибке в английской раскладке клавиатуры вместо русских букв (all) + private $en_sc = '[a-zA-Z\'`~<>,.:;{}\[\]"]'; + + #символы, которые м.б. набраны по ошибке в английской раскладке клавиатуры вместо русских букв + private $sc = '[\'`~<>,.:;{}\[\]"]'; + private $no_sc = '[^\'`~<>,.:;{}\[\]"]'; + + #русский + татарский (all) + private $tt = '[\xd0-\xd3][\x80-\xbf] + (?<=\xd0[\x90-\xbf\x81]|\xd1[\x80-\x8f\x91]|\xd2[\x96\x97\xa2\xa3\xae\xaf\xba\xbb]|\xd3[\x98\x99\xa8\xa9])'; + + #русский + татарский (uppercase) + private $tt_uc = '[\xd0\xd2\xd3][\x81-\xba] + (?<=\xd0[\x90-\xaf\x81]|\xd2[\x96\xa2\xae\xba]|\xd3[\x98\xa8])'; + + #русский + татарский (для фильтрованных текстов) (all) + private $tt_f = '[\xd0-\xd3][\x80-\xbf] + #комментируем для увеличения скорости, т.к. 
остальные символы отфильтрованы + #(?<=\xd0[\x90-\xbf\x81]|\xd1[\x80-\x8f\x91]|\xd2[\x96\x97\xa2\xa3\xae\xaf\xba\xbb]|\xd3[\x98\x99\xa8\xa9]) + '; + + #гласная (vowel) (lowercase) + private $vowel_lc = array( + 'tt' => '\xd0[\xb0\xb5\xb8\xbe]|\xd1[\x83\x8b\x8d\x8e\x8f\x91] #аеиоуыэюяё (гласные, 10 шт.) + #| \xd0[\x90\x95\x98\x9e\xa3\xab\xad\xae\xaf\x81] #АЕИОУЫЭЮЯЁ (гласные, 10 шт.) + ', + 'en' => '[aeiouy]', #латинских 6 шт. + ); + + #согласная (consonant) + графические знаки для русского языка (ъ, ь) (lowercase) + private $consonant_lc = array( + 'tt' => '\xd0[\xb1-\xb4\xb6\xb7\xb9\xba-\xbd\xbf]|\xd1[\x80\x81\x82\x84-\x89\x8a\x8c] #бвгджзйклмнпрстфхцчшщ ъь (согласные, 21+2 шт.) + #| \xd0[\x91-\x94\x96\x97\x99\x9a-\x9d\x9f-\xa2\xa4-\xa9\xaa\xac] #БВГДЖЗЙКЛМНПРСТФХЦЧШЩ ЪЬ (согласные, 21+2 шт.) + ', + 'en' => '[bcdfghjklmnpqrstvwxz]', #латинских 20 шт. + ); + + private $words_exceptions = array( + 'tt' => array( + 'трлн' => null, + 'ющенко' => null, + 'мебельград' => null, + 'дэнис' => null, + ), + 'en' => array( + 'heuer' => null, + ), + ); + + #русские буквы, похожие на англ. (uppercase) + private $ru_similar_uc = "\xd0[\x90\x92\x95\x9a\x9c\x9d\x9e\xa0-\xa3\xa5]"; + + #русские буквы, похожие на англ. (all) + private $ru_similar = "\xd0[\x90\x92\x95\x9a\x9c\x9d\x9e\xa0-\xa3\xa5\xb0\xb5\xbe]|\xd1[\x80\x81\x83\x85]"; + + #англ. 
буквы, похожие на русские (uppercase) + private $en_similar_uc = '[ABEKMHOPCTYX]'; + + /* + #$tt_fake = '\xd0[\xb0\xb5\xbe\x90\x92\x95\x9a\x9c\x9d\x9e\xa0\xa1\xa2\xa3\xa5]|\xd1[\x80\x81\x83\x85]'; + $tt_fake = '[\xd0\xd1][\x80-\xbe] + (?<=\xd0[\xb0\xb5\xbe\x90\x92\x95\x9a\x9c\x9d\x9e\xa0\xa1\xa2\xa3\xa5]|\xd1[\x80\x81\x83\x85])'; + $en_fake = '[aeopcyxABEKMHOPCTYX]'; + */ + + #уникальные русские буквы + /* + CASE_UPPER, case_lower + "\xd0\x81", "\xd1\x91", #Ё ё + "\xd0\x91", "\xd0\xb1", #Б б + "\xd0\x92", "\xd0\xb2", #В в + "\xd0\x93", "\xd0\xb3", #Г г + "\xd0\x94", "\xd0\xb4", #Д д + "\xd0\x96", "\xd0\xb6", #Ж ж + "\xd0\x97", "\xd0\xb7", #З з + "\xd0\x98", "\xd0\xb8", #И и + "\xd0\x99", "\xd0\xb9", #Й й + "\xd0\xba", #К к + "\xd0\x9b", "\xd0\xbb", #Л л + "\xd0\xbd", #Н н + "\xd0\x9f", "\xd0\xbf", #П п + "\xd1\x82", #Т т + "\xd0\xa4", "\xd1\x84", #Ф ф + "\xd0\xa6", "\xd1\x86", #Ц ц + "\xd0\xa7", "\xd1\x87", #Ч ч + "\xd0\xa8", "\xd1\x88", #Ш ш + "\xd0\xa9", "\xd1\x89", #Щ щ + "\xd0\xaa", "\xd1\x8a", #Ъ ъ + "\xd0\xab", "\xd1\x8b", #Ы ы + "\xd0\xac", "\xd1\x8c", #Ь ь + "\xd0\xad", "\xd1\x8d", #Э э + "\xd0\xae", "\xd1\x8e", #Ю ю + "\xd0\xaf", "\xd1\x8f", #Я я + */ + #$tt_uniq = "\xd0[\xb1-\xb4\xb6-\xbb\xbd\xbf\x81\x91-\x94\x96-\x99\x9b\x9f\xa4\xa6-\xaf]|\xd1[\x82\x84\x86-\x8f\x91]"; + private $tt_uniq = "[\xd0\xd1][\x82-\xbf] + (?<=\xd0[\xb1-\xb4\xb6-\xbb\xbd\xbf\x81\x91-\x94\x96-\x99\x9b\x9f\xa4\xa6-\xaf]|\xd1[\x82\x84\x86-\x8f\x91])"; + + #уникальные латинские буквы + /* + CASE_UPPER, case_lower + "\x42", "\x62", #B b + "\x44", "\x64", #D d + "\x46", "\x66", #F f + "\x68", #H h + "\x49", "\x69", #I i + "\x4a", "\x6a", #J j + "\x6b", #K k + "\x4c", "\x6c", #L l + "\x6d", #M m + "\x4e", "\x6e", #N n + "\x51", "\x71", #Q q + "\x52", "\x72", #R r + "\x53", "\x73", #S s + "\x74", #T t + "\x55", "\x75", #U u + "\x56", "\x76", #V v + "\x57", "\x77", #W w + "\x5a", "\x7a", #Z z + */ + private $en_uniq = 
"[\x42\x44\x46\x49\x4a\x4c\x4e\x51\x52\x53\x55\x57\x56\x5a\x62\x64\x66\x68\x69\x6a-\x6e\x71-\x77\x7a]"; + + private $table_flip; #array + private $words; #corrected words + private $en_correct; #string + private $tt_correct; #string + private $mode; #bool + + private $is_flip = false; + private $method = 0; + + private $table = array( + #метод 0: таблица исправления ошибочно набранных букв, которые выглядят одинаково (русский <--> английский) + 0 => array( + #lowercase #UPPERCASE + "\xd0\xb0" => 'a', "\xd0\x90" => 'A', + "\xd0\x92" => 'B', + "\xd0\xb5" => 'e', "\xd0\x95" => 'E', + "\xd0\x9a" => 'K', + "\xd0\x9c" => 'M', + "\xd0\x9d" => 'H', + "\xd0\xbe" => 'o', "\xd0\x9e" => 'O', + "\xd1\x80" => 'p', "\xd0\xa0" => 'P', + "\xd1\x81" => 'c', "\xd0\xa1" => 'C', + "\xd0\xa2" => 'T', + "\xd1\x83" => 'y', "\xd0\xa3" => 'Y', + "\xd1\x85" => 'x', "\xd0\xa5" => 'X', + ), + #метод 1: таблица исправления ошибочно набранных букв в другой раскладке клавиатуры (русский <--> английский) + 1 => array( + #CASE_UPPER #case_lower + "\xd0\x81" => '~', "\xd1\x91" => '`', #Ё ё + "\xd0\x90" => 'F', "\xd0\xb0" => 'f', #А а + "\xd0\x91" => '<', "\xd0\xb1" => ',', #Б б + "\xd0\x92" => 'D', "\xd0\xb2" => 'd', #В в + "\xd0\x93" => 'U', "\xd0\xb3" => 'u', #Г г + "\xd0\x94" => 'L', "\xd0\xb4" => 'l', #Д д + "\xd0\x95" => 'T', "\xd0\xb5" => 't', #Е е + "\xd0\x96" => ':', "\xd0\xb6" => ';', #Ж ж + "\xd0\x97" => 'P', "\xd0\xb7" => 'p', #З з + "\xd0\x98" => 'B', "\xd0\xb8" => 'b', #И и + "\xd0\x99" => 'Q', "\xd0\xb9" => 'q', #Й й + "\xd0\x9a" => 'R', "\xd0\xba" => 'r', #К к + "\xd0\x9b" => 'K', "\xd0\xbb" => 'k', #Л л + "\xd0\x9c" => 'V', "\xd0\xbc" => 'v', #М м + "\xd0\x9d" => 'Y', "\xd0\xbd" => 'y', #Н н + "\xd0\x9e" => 'J', "\xd0\xbe" => 'j', #О о + "\xd0\x9f" => 'G', "\xd0\xbf" => 'g', #П п + #CASE_UPPER #case_lower + "\xd0\xa0" => 'H', "\xd1\x80" => 'h', #Р р + "\xd0\xa1" => 'C', "\xd1\x81" => 'c', #С с + "\xd0\xa2" => 'N', "\xd1\x82" => 'n', #Т т + "\xd0\xa3" => 'E', "\xd1\x83" => 'e', #У у + 
"\xd0\xa4" => 'A', "\xd1\x84" => 'a', #Ф ф + "\xd0\xa5" => '{', "\xd1\x85" => '[', #Х х + "\xd0\xa6" => 'W', "\xd1\x86" => 'w', #Ц ц + "\xd0\xa7" => 'X', "\xd1\x87" => 'x', #Ч ч + "\xd0\xa8" => 'I', "\xd1\x88" => 'i', #Ш ш + "\xd0\xa9" => 'O', "\xd1\x89" => 'o', #Щ щ + "\xd0\xaa" => '}', "\xd1\x8a" => ']', #Ъ ъ + "\xd0\xab" => 'S', "\xd1\x8b" => 's', #Ы ы + "\xd0\xac" => 'M', "\xd1\x8c" => 'm', #Ь ь + "\xd0\xad" => '"', "\xd1\x8d" => "'", #Э э + "\xd0\xae" => '>', "\xd1\x8e" => '.', #Ю ю + "\xd0\xaf" => 'Z', "\xd1\x8f" => 'z', #Я я + ), + ); + + #несуществующие N-граммы для гласных букв + private $vowels3_lc = array( + 'en' => array( + 'aea' => 0, + 'aei' => 1, + 'aeo' => 2, + 'aeu' => 3, + 'aia' => 4, + 'aie' => 5, + 'aii' => 6, + 'aoi' => 7, + 'aou' => 8, + 'aue' => 9, + 'aya' => 10, + 'aye' => 11, + 'ayi' => 12, + 'ayo' => 13, + 'ayu' => 14, + 'eae' => 15, + 'eau' => 16, + 'eea' => 17, + 'eei' => 18, + 'eeu' => 19, + 'eia' => 20, + 'eiu' => 21, + 'eoi' => 22, + 'eou' => 23, + 'eya' => 24, + 'eye' => 25, + 'eyi' => 26, + 'eyo' => 27, + 'iae' => 28, + 'iai' => 29, + 'iao' => 30, + 'iau' => 31, + 'iei' => 32, + 'ieu' => 33, + 'ioa' => 34, + 'ioe' => 35, + 'iou' => 36, + 'iya' => 37, + 'oae' => 38, + 'oea' => 39, + 'oei' => 40, + 'oeo' => 41, + 'oeu' => 42, + 'oey' => 43, + 'oia' => 44, + 'oie' => 45, + 'ooe' => 46, + 'ooi' => 47, + 'oou' => 48, + 'oua' => 49, + 'oue' => 50, + 'oui' => 51, + 'oya' => 52, + 'oye' => 53, + 'oyi' => 54, + 'oyo' => 55, + 'uae' => 56, + 'uai' => 57, + 'uay' => 58, + 'uea' => 59, + 'uee' => 60, + 'uei' => 61, + 'ueo' => 62, + 'ueu' => 63, + 'uey' => 64, + 'uia' => 65, + 'uie' => 66, + 'uio' => 67, + 'uiu' => 68, + 'uoa' => 69, + 'uoi' => 70, + 'uou' => 71, + 'uoy' => 72, + 'uya' => 73, + 'uye' => 74, + 'uyi' => 75, + 'yae' => 76, + 'yao' => 77, + 'yau' => 78, + 'yea' => 79, + 'yei' => 80, + 'yeo' => 81, + 'yey' => 82, + 'yie' => 83, + 'yoi' => 84, + 'you' => 85, + 'yoy' => 86, + 'yua' => 87, + ), + 'tt' => array( + 'аау' => 0, + 'аео' => 
1, + 'аеу' => 2, + 'аиа' => 3, + 'аио' => 4, + 'аиу' => 5, + 'аои' => 6, + 'ауэ' => 7, + 'аяя' => 8, + 'еаэ' => 9, + 'еее' => 10, + 'еео' => 11, + 'еоа' => 12, + 'еои' => 13, + 'еоо' => 14, + 'еую' => 15, + 'еуя' => 16, + 'еуё' => 17, + 'иау' => 18, + 'иео' => 19, + 'иие' => 20, + 'иоа' => 21, + 'иои' => 22, + 'иоу' => 23, + 'иоэ' => 24, + 'ияе' => 25, + 'ияи' => 26, + 'ияю' => 27, + 'оаэ' => 28, + 'оео' => 29, + 'оею' => 30, + 'оие' => 31, + 'оуе' => 32, + 'оуя' => 33, + 'оюе' => 34, + 'оюю' => 35, + 'ояе' => 36, + 'уео' => 37, + 'уюю' => 38, + ), + ); + + #несуществующие N-граммы для согласных букв + private $consonants4_lc = array( + 'en' => array( + 'bldg' => 0, + 'blvd' => 1, + 'bscr' => 2, + 'bstr' => 3, + 'cbcm' => 4, + 'cbft' => 5, + 'chfr' => 6, + 'chmn' => 7, + 'chsc' => 8, + 'chsh' => 9, + 'chst' => 10, + 'chth' => 11, + 'chts' => 12, + 'ckbr' => 13, + 'ckch' => 14, + 'ckcl' => 15, + 'ckdr' => 16, + 'ckgr' => 17, + 'cksc' => 18, + 'cksf' => 19, + 'cksh' => 20, + 'cksk' => 21, + 'cksl' => 22, + 'cksm' => 23, + 'cksn' => 24, + 'cksp' => 25, + 'ckst' => 26, + 'cksw' => 27, + 'ckth' => 28, + 'cktr' => 29, + 'ckwh' => 30, + 'cmps' => 31, + 'dspr' => 32, + 'dstr' => 33, + 'dthw' => 34, + 'ffsc' => 35, + 'ffsh' => 36, + 'ffsp' => 37, + 'fthl' => 38, + 'ftsm' => 39, + 'ftsp' => 40, + 'gdns' => 41, + 'ghbr' => 42, + 'ghfl' => 43, + 'ghsh' => 44, + 'ghtb' => 45, + 'ghtc' => 46, + 'ghtf' => 47, + 'ghth' => 48, + 'ghtj' => 49, + 'ghtl' => 50, + 'ghtm' => 51, + 'ghtn' => 52, + 'ghtr' => 53, + 'ghts' => 54, + 'ghtw' => 55, + 'hdbk' => 56, + 'hnst' => 57, + 'jctn' => 58, + 'khsh' => 59, + 'khst' => 60, + 'lchr' => 61, + 'ldpr' => 62, + 'ldsh' => 63, + 'ldsm' => 64, + 'ldsp' => 65, + 'ldst' => 66, + 'lfsk' => 67, + 'lfth' => 68, + 'lgth' => 69, + 'llfl' => 70, + 'llfr' => 71, + 'llph' => 72, + 'llpl' => 73, + 'llsh' => 74, + 'llsp' => 75, + 'llst' => 76, + 'lltr' => 77, + 'llwr' => 78, + 'lmcr' => 79, + 'lmsm' => 80, + 'lnrk' => 81, + 'lnsh' => 82, + 'lptr' => 83, + 
'lsgr' => 84, + 'lshm' => 85, + 'lshw' => 86, + 'lstr' => 87, + 'lthf' => 88, + 'ltsf' => 89, + 'ltsh' => 90, + 'ltst' => 91, + 'mbsc' => 92, + 'mbsh' => 93, + 'mbsk' => 94, + 'mbst' => 95, + 'mddx' => 96, + 'mdnt' => 97, + 'mpbl' => 98, + 'mpgr' => 99, + 'mphl' => 100, + 'mphr' => 101, + 'mpsh' => 102, + 'mpst' => 103, + 'mptl' => 104, + 'mptn' => 105, + 'mptr' => 106, + 'mpts' => 107, + 'mscr' => 108, + 'mstr' => 109, + 'nchb' => 110, + 'nchl' => 111, + 'nchm' => 112, + 'nchn' => 113, + 'nchp' => 114, + 'nchr' => 115, + 'nchw' => 116, + 'nctl' => 117, + 'nctn' => 118, + 'ndbk' => 119, + 'ndbr' => 120, + 'ndch' => 121, + 'ndfl' => 122, + 'ndgl' => 123, + 'ndgr' => 124, + 'ndsc' => 125, + 'ndsh' => 126, + 'ndsl' => 127, + 'ndsm' => 128, + 'ndsp' => 129, + 'ndst' => 130, + 'ndsw' => 131, + 'ndth' => 132, + 'ndwr' => 133, + 'ngcr' => 134, + 'ngsg' => 135, + 'ngsh' => 136, + 'ngsm' => 137, + 'ngsp' => 138, + 'ngst' => 139, + 'ngth' => 140, + 'ngtz' => 141, + 'nksg' => 142, + 'nksh' => 143, + 'nksm' => 144, + 'nkst' => 145, + 'nsch' => 146, + 'nscr' => 147, + 'nsgr' => 148, + 'nshr' => 149, + 'nskr' => 150, + 'nspl' => 151, + 'nspr' => 152, + 'nssh' => 153, + 'nstr' => 154, + 'ntbr' => 155, + 'nthl' => 156, + 'nthr' => 157, + 'nths' => 158, + 'ntsh' => 159, + 'ntsm' => 160, + 'phth' => 161, + 'pstr' => 162, + 'pthr' => 163, + 'pths' => 164, + 'ptwr' => 165, + 'rbst' => 166, + 'rchb' => 167, + 'rchd' => 168, + 'rchl' => 169, + 'rchm' => 170, + 'rchn' => 171, + 'rchp' => 172, + 'rchw' => 173, + 'rdsh' => 174, + 'rdsm' => 175, + 'rdst' => 176, + 'rghs' => 177, + 'rkpl' => 178, + 'rksc' => 179, + 'rksh' => 180, + 'rksk' => 181, + 'rksm' => 182, + 'rksp' => 183, + 'rkst' => 184, + 'rldl' => 185, + 'rldw' => 186, + 'rlfr' => 187, + 'rmch' => 188, + 'rmst' => 189, + 'rmth' => 190, + 'rnbl' => 191, + 'rndl' => 192, + 'rnsk' => 193, + 'rnsp' => 194, + 'rnst' => 195, + 'rsch' => 196, + 'rscr' => 197, + 'rshl' => 198, + 'rshn' => 199, + 'rspr' => 200, + 'rstl' => 201, + 'rstr' => 
202, + 'rsts' => 203, + 'rstw' => 204, + 'rtbr' => 205, + 'rtch' => 206, + 'rtcr' => 207, + 'rthb' => 208, + 'rthc' => 209, + 'rthd' => 210, + 'rthf' => 211, + 'rthl' => 212, + 'rthm' => 213, + 'rthq' => 214, + 'rthr' => 215, + 'rths' => 216, + 'rthw' => 217, + 'rtsh' => 218, + 'rtsm' => 219, + 'rtsp' => 220, + 'rtsw' => 221, + 'schl' => 222, + 'schm' => 223, + 'schn' => 224, + 'schw' => 225, + 'scrp' => 226, + 'sgmt' => 227, + 'shcl' => 228, + 'shkh' => 229, + 'shpr' => 230, + 'shpt' => 231, + 'shst' => 232, + 'shtr' => 233, + 'shwh' => 234, + 'smth' => 235, + 'ssrs' => 236, + 'ssst' => 237, + 'sstd' => 238, + 'sstr' => 239, + 'stcr' => 240, + 'sthm' => 241, + 'stpl' => 242, + 'stpr' => 243, + 'stsc' => 244, + 'stwr' => 245, + 'tblt' => 246, + 'tchb' => 247, + 'tchc' => 248, + 'tchd' => 249, + 'tchf' => 250, + 'tchl' => 251, + 'tchm' => 252, + 'tchp' => 253, + 'tchw' => 254, + 'thdr' => 255, + 'thsh' => 256, + 'thsk' => 257, + 'thsp' => 258, + 'thst' => 259, + 'tsch' => 260, + 'tspr' => 261, + 'tstr' => 262, + 'tthr' => 263, + 'ttsb' => 264, + 'tzkr' => 265, + 'whsl' => 266, + 'wnbr' => 267, + 'wnpl' => 268, + 'wnsf' => 269, + 'wnsh' => 270, + 'wnsm' => 271, + 'wnsp' => 272, + 'wnst' => 273, + 'wnsw' => 274, + 'wnth' => 275, + 'wntr' => 276, + 'wrnt' => 277, + 'wsfl' => 278, + 'wspr' => 279, + 'wstr' => 280, + 'xthl' => 281, + ), + 'tt' => array( + 'блзд' => 0, + 'бльд' => 1, + 'брьс' => 2, + 'бств' => 3, + 'бстр' => 4, + 'взбл' => 5, + 'взбр' => 6, + 'взгл' => 7, + 'взгр' => 8, + 'вздв' => 9, + 'вздр' => 10, + 'врвг' => 11, + 'врск' => 12, + 'вскл' => 13, + 'вскр' => 14, + 'вспл' => 15, + 'вспр' => 16, + 'вств' => 17, + 'встр' => 18, + 'всхл' => 19, + 'всхр' => 20, + 'втск' => 21, + 'вхск' => 22, + 'грск' => 23, + 'гств' => 24, + 'гтст' => 25, + 'гшпр' => 26, + 'двзд' => 27, + 'джск' => 28, + 'дрст' => 29, + 'дскр' => 30, + 'дств' => 31, + 'дстр' => 32, + 'дтск' => 33, + 'жств' => 34, + 'звзд' => 35, + 'знст' => 36, + 'зтьс' => 37, + 'йздр' => 38, + 'йкбр' => 39, 
+ 'йльн' => 40, + 'йншт' => 41, + 'йпфр' => 42, + 'йств' => 43, + 'йстр' => 44, + 'йтск' => 45, + 'йфст' => 46, + 'йхсв' => 47, + 'йхск' => 48, + 'йхср' => 49, + 'йхст' => 50, + 'кскл' => 51, + 'кскр' => 52, + 'кспл' => 53, + 'кспр' => 54, + 'кств' => 55, + 'кстн' => 56, + 'кстр' => 57, + 'лвст' => 58, + 'лжск' => 59, + 'лльн' => 60, + 'лльс' => 61, + 'лстр' => 62, + 'лсть' => 63, + 'льгв' => 64, + 'льдж' => 65, + 'льдк' => 66, + 'льдм' => 67, + 'льдс' => 68, + 'льдф' => 69, + 'льдц' => 70, + 'льдш' => 71, + 'льдъ' => 72, + 'льдь' => 73, + 'льзк' => 74, + 'льзн' => 75, + 'льзь' => 76, + 'лькл' => 77, + 'лькн' => 78, + 'льпн' => 79, + 'льпт' => 80, + 'льск' => 81, + 'льсн' => 82, + 'льст' => 83, + 'льтк' => 84, + 'льтм' => 85, + 'льтн' => 86, + 'льтп' => 87, + 'льтр' => 88, + 'льтс' => 89, + 'льтт' => 90, + 'льтф' => 91, + 'льфр' => 92, + 'льцг' => 93, + 'льчс' => 94, + 'льшб' => 95, + 'льшк' => 96, + 'льшн' => 97, + 'льшп' => 98, + 'льшф' => 99, + 'льшь' => 100, + 'мбль' => 101, + 'мбрс' => 102, + 'мвзв' => 103, + 'мздр' => 104, + 'мств' => 105, + 'мтск' => 106, + 'нгль' => 107, + 'нгст' => 108, + 'ндгр' => 109, + 'ндск' => 110, + 'ндсп' => 111, + 'ндшп' => 112, + 'ндшт' => 113, + 'нкск' => 114, + 'нктн' => 115, + 'нктс' => 116, + 'нсгр' => 117, + 'нскм' => 118, + 'нскр' => 119, + 'нспл' => 120, + 'нств' => 121, + 'нстк' => 122, + 'нстр' => 123, + 'нтгл' => 124, + 'нтль' => 125, + 'нтрб' => 126, + 'нтрв' => 127, + 'нтрг' => 128, + 'нтрд' => 129, + 'нтрм' => 130, + 'нтрн' => 131, + 'нтрп' => 132, + 'нтрр' => 133, + 'нтрф' => 134, + 'нтск' => 135, + 'нтст' => 136, + 'нфск' => 137, + 'нцкл' => 138, + 'нцпл' => 139, + 'нькн' => 140, + 'ньск' => 141, + 'ньчж' => 142, + 'псск' => 143, + 'пств' => 144, + 'птск' => 145, + 'рбск' => 146, + 'ргпр' => 147, + 'ргск' => 148, + 'ргфл' => 149, + 'рдск' => 150, + 'рдсм' => 151, + 'рдст' => 152, + 'рздр' => 153, + 'рзть' => 154, + 'ркгр' => 155, + 'ркск' => 156, + 'рктн' => 157, + 'рльс' => 158, + 'рмск' => 159, + 'рмтр' => 160, + 
'рнск' => 161, + 'рпск' => 162, + 'рсдр' => 163, + 'рсск' => 164, + 'рств' => 165, + 'рстк' => 166, + 'рстн' => 167, + 'рстр' => 168, + 'рстс' => 169, + 'рсть' => 170, + 'ртвл' => 171, + 'ртвр' => 172, + 'ртгр' => 173, + 'рткр' => 174, + 'ртпл' => 175, + 'ртпр' => 176, + 'ртск' => 177, + 'ртсм' => 178, + 'ртшк' => 179, + 'ртьф' => 180, + 'рхзв' => 181, + 'рхпл' => 182, + 'рхпр' => 183, + 'рхсв' => 184, + 'рхск' => 185, + 'рхсм' => 186, + 'рхср' => 187, + 'рхтв' => 188, + 'рхшт' => 189, + 'рщвл' => 190, + 'рьмл' => 191, + 'скск' => 192, + 'спрь' => 193, + 'сспр' => 194, + 'ссср' => 195, + 'сств' => 196, + 'сстр' => 197, + 'ссшп' => 198, + 'ствл' => 199, + 'стрс' => 200, + 'стрш' => 201, + 'стск' => 202, + 'стьб' => 203, + 'стьд' => 204, + 'стьс' => 205, + 'ськн' => 206, + 'сьмн' => 207, + 'тмст' => 208, + 'тпрр' => 209, + 'трст' => 210, + 'тскр' => 211, + 'тств' => 212, + 'тстр' => 213, + 'ттль' => 214, + 'ттск' => 215, + 'тхск' => 216, + 'фств' => 217, + 'фстр' => 218, + 'хств' => 219, + 'хстр' => 220, + 'хткл' => 221, + 'хтск' => 222, + 'хтсм' => 223, + 'цстр' => 224, + ), + ); + + #несуществующие биграммы в начале и конце слов + private $bigrams = array( + #ru + ' ёё' => 0, + ' ёа' => 0, + ' ёб' => 0, + ' ёв' => 0, + ' ёг' => 0, + ' ёд' => 0, + ' ёе' => 0, + ' ёз' => 0, + ' ёи' => 0, + ' ёй' => 0, + ' ён' => 0, + ' ёо' => 0, + ' ёп' => 0, + ' ёс' => 0, + ' ёт' => 0, + ' ёу' => 0, + ' ёф' => 0, + ' ёц' => 0, + ' ёч' => 0, + ' ёщ' => 0, + ' ёъ' => 0, + ' ёы' => 0, + ' ёь' => 0, + ' ёэ' => 0, + ' ёю' => 0, + ' ёя' => 0, + ' аё' => 0, + ' аа' => 0, + ' ае' => 0, + ' ач' => 0, + ' аъ' => 0, + ' аы' => 0, + ' аь' => 0, + ' аю' => 0, + ' ая' => 0, + ' бб' => 0, + ' бв' => 0, + ' бг' => 0, + ' бж' => 0, + ' бй' => 0, + ' бк' => 0, + ' бм' => 0, + ' бн' => 0, + ' бп' => 0, + ' бс' => 0, + ' бт' => 0, + ' бф' => 0, + ' бх' => 0, + ' бц' => 0, + ' бч' => 0, + ' бш' => 0, + ' бщ' => 0, + ' бъ' => 0, + ' вй' => 0, + ' вф' => 0, + ' вщ' => 0, + ' вэ' => 0, + ' вю' => 0, + ' 
гё' => 0, + ' гб' => 0, + ' гз' => 0, + ' гй' => 0, + ' гк' => 0, + ' гп' => 0, + ' гс' => 0, + ' гт' => 0, + ' гф' => 0, + ' гх' => 0, + ' гц' => 0, + ' гч' => 0, + ' гш' => 0, + ' гщ' => 0, + ' гъ' => 0, + ' гь' => 0, + ' гэ' => 0, + ' дб' => 0, + ' дг' => 0, + ' дд' => 0, + ' дй' => 0, + ' дк' => 0, + ' дп' => 0, + ' дс' => 0, + ' дт' => 0, + ' дф' => 0, + ' дх' => 0, + ' дц' => 0, + ' дч' => 0, + ' дш' => 0, + ' дъ' => 0, + ' дэ' => 0, + ' еа' => 0, + ' еб' => 0, + ' еи' => 0, + ' ео' => 0, + ' ет' => 0, + ' еу' => 0, + ' ец' => 0, + #' еш' => 0, + ' еъ' => 0, + ' еы' => 0, + ' еь' => 0, + ' еэ' => 0, + ' ея' => 0, + ' жз' => 0, + ' жй' => 0, + ' жк' => 0, + ' жл' => 0, + ' жп' => 0, + ' жс' => 0, + ' жт' => 0, + ' жф' => 0, + ' жх' => 0, + ' жц' => 0, + ' жч' => 0, + ' жш' => 0, + ' жщ' => 0, + ' жъ' => 0, + ' жы' => 0, + ' жь' => 0, + ' жэ' => 0, + #' жю' => 0, + ' жя' => 0, + ' зб' => 0, + ' зж' => 0, + ' зз' => 0, + ' зй' => 0, + ' зк' => 0, + ' зп' => 0, + ' зс' => 0, + ' зт' => 0, + ' зф' => 0, + ' зх' => 0, + ' зц' => 0, + ' зч' => 0, + ' зш' => 0, + ' зщ' => 0, + ' зъ' => 0, + ' зь' => 0, + ' зэ' => 0, + ' иё' => 0, + ' иа' => 0, + ' иф' => 0, + ' иц' => 0, + ' иъ' => 0, + ' иы' => 0, + ' иь' => 0, + ' иэ' => 0, + ' ия' => 0, + ' йё' => 0, + ' йа' => 0, + ' йб' => 0, + ' йв' => 0, + ' йг' => 0, + ' йд' => 0, + ' йж' => 0, + ' йз' => 0, + ' йи' => 0, + ' йй' => 0, + ' йк' => 0, + ' йл' => 0, + ' йм' => 0, + ' йн' => 0, + ' йп' => 0, + ' йр' => 0, + ' йс' => 0, + ' йт' => 0, + ' йу' => 0, + ' йф' => 0, + ' йх' => 0, + ' йц' => 0, + ' йч' => 0, + ' йш' => 0, + ' йщ' => 0, + ' йъ' => 0, + ' йы' => 0, + ' йь' => 0, + ' йэ' => 0, + ' йю' => 0, + ' йя' => 0, + ' кё' => 0, + ' кб' => 0, + ' кд' => 0, + ' кж' => 0, + ' кй' => 0, + ' кк' => 0, + ' кф' => 0, + ' кц' => 0, + ' кч' => 0, + ' кщ' => 0, + ' къ' => 0, + ' кя' => 0, + ' лв' => 0, + ' лд' => 0, + ' лз' => 0, + ' лй' => 0, + ' лк' => 0, + ' лл' => 0, + ' лм' => 0, + ' лн' => 0, + ' лп' => 0, + ' лр' => 0, 
+ ' лс' => 0, + ' лт' => 0, + ' лф' => 0, + ' лх' => 0, + ' лц' => 0, + ' лч' => 0, + ' лш' => 0, + ' лщ' => 0, + ' лъ' => 0, + ' лэ' => 0, + ' мб' => 0, + ' мв' => 0, + ' мд' => 0, + ' мж' => 0, + ' мй' => 0, + ' мк' => 0, + ' мп' => 0, + ' мт' => 0, + ' мф' => 0, + ' мц' => 0, + ' мъ' => 0, + ' мь' => 0, + ' нб' => 0, + ' нв' => 0, + ' нг' => 0, + ' нд' => 0, + ' нж' => 0, + ' нз' => 0, + ' нй' => 0, + ' нк' => 0, + ' нл' => 0, + ' нм' => 0, + ' нн' => 0, + ' нп' => 0, + ' нс' => 0, + ' нт' => 0, + ' нф' => 0, + ' нх' => 0, + ' нц' => 0, + ' нч' => 0, + ' нш' => 0, + ' нщ' => 0, + ' нъ' => 0, + ' оё' => 0, + ' ои' => 0, + ' оу' => 0, + ' оъ' => 0, + ' оы' => 0, + ' оь' => 0, + ' оэ' => 0, + ' оя' => 0, + ' пб' => 0, + ' пв' => 0, + ' пг' => 0, + ' пд' => 0, + ' пж' => 0, + ' пз' => 0, + ' пй' => 0, + ' пк' => 0, + ' пм' => 0, + ' пп' => 0, + ' пц' => 0, + ' пщ' => 0, + ' пъ' => 0, + ' рб' => 0, + ' рг' => 0, + ' рз' => 0, + ' рй' => 0, + ' рк' => 0, + ' рл' => 0, + ' рм' => 0, + ' рн' => 0, + ' рп' => 0, + ' рр' => 0, + ' рф' => 0, + ' рх' => 0, + ' рч' => 0, + ' рш' => 0, + ' рщ' => 0, + ' ръ' => 0, + ' сй' => 0, + ' сщ' => 0, + ' тб' => 0, + ' тг' => 0, + ' тд' => 0, + ' тж' => 0, + ' тз' => 0, + ' тй' => 0, + ' тн' => 0, + ' тт' => 0, + ' тх' => 0, + ' тц' => 0, + ' тч' => 0, + ' тш' => 0, + ' тъ' => 0, + ' уу' => 0, + ' уъ' => 0, + ' уы' => 0, + ' уь' => 0, + ' фб' => 0, + ' фв' => 0, + ' фг' => 0, + ' фд' => 0, + ' фж' => 0, + ' фз' => 0, + ' фй' => 0, + ' фк' => 0, + ' фм' => 0, + ' фн' => 0, + ' фп' => 0, + ' фс' => 0, + ' фх' => 0, + ' фц' => 0, + ' фч' => 0, + ' фш' => 0, + ' фщ' => 0, + ' фъ' => 0, + ' фэ' => 0, + ' фя' => 0, + ' хё' => 0, + ' хб' => 0, + ' хг' => 0, + ' хд' => 0, + ' хж' => 0, + ' хз' => 0, + ' хй' => 0, + ' хк' => 0, + ' хп' => 0, + ' хс' => 0, + ' хт' => 0, + ' хф' => 0, + ' хц' => 0, + ' хч' => 0, + ' хш' => 0, + ' хщ' => 0, + ' хъ' => 0, + ' хы' => 0, + ' хь' => 0, + #' хэ' => 0, + ' хю' => 0, + ' хя' => 0, + ' цё' => 0, + ' цб' => 
0, + ' цг' => 0, + ' цд' => 0, + ' цж' => 0, + ' цй' => 0, + ' цл' => 0, + ' цм' => 0, + ' цн' => 0, + ' цп' => 0, + ' цр' => 0, + ' цс' => 0, + ' цт' => 0, + ' цф' => 0, + ' цх' => 0, + ' цц' => 0, + ' цч' => 0, + ' цш' => 0, + ' цщ' => 0, + ' цъ' => 0, + ' ць' => 0, + ' цэ' => 0, + ' цю' => 0, + ' ця' => 0, + ' чб' => 0, + ' чг' => 0, + ' чд' => 0, + ' чж' => 0, + ' чз' => 0, + ' чй' => 0, + ' чн' => 0, + ' чп' => 0, + ' чс' => 0, + ' чф' => 0, + ' чц' => 0, + ' чч' => 0, + ' чщ' => 0, + ' чъ' => 0, + ' чы' => 0, + ' чэ' => 0, + ' чю' => 0, + ' чя' => 0, + ' шб' => 0, + ' шг' => 0, + ' шд' => 0, + ' шж' => 0, + ' шз' => 0, + ' шй' => 0, + ' шс' => 0, + ' шф' => 0, + ' шц' => 0, + ' шч' => 0, + ' шщ' => 0, + ' шъ' => 0, + ' шы' => 0, + ' шэ' => 0, + ' шю' => 0, + ' шя' => 0, + ' щб' => 0, + ' щв' => 0, + ' щг' => 0, + ' щд' => 0, + ' щж' => 0, + ' щз' => 0, + ' щй' => 0, + ' щк' => 0, + ' щл' => 0, + ' щм' => 0, + ' щн' => 0, + ' що' => 0, + ' щп' => 0, + ' щр' => 0, + ' щс' => 0, + ' щт' => 0, + ' щф' => 0, + ' щх' => 0, + ' щц' => 0, + ' щч' => 0, + ' щш' => 0, + ' щщ' => 0, + ' щъ' => 0, + ' щы' => 0, + ' щь' => 0, + ' щэ' => 0, + ' щю' => 0, + ' щя' => 0, + ' ъё' => 0, + ' ъа' => 0, + ' ъб' => 0, + ' ъв' => 0, + ' ъг' => 0, + ' ъд' => 0, + ' ъе' => 0, + ' ъж' => 0, + ' ъз' => 0, + ' ъи' => 0, + ' ъй' => 0, + ' ък' => 0, + ' ъл' => 0, + ' ъм' => 0, + ' ън' => 0, + ' ъо' => 0, + ' ъп' => 0, + ' ър' => 0, + ' ъс' => 0, + ' ът' => 0, + ' ъу' => 0, + ' ъф' => 0, + ' ъх' => 0, + ' ъц' => 0, + ' ъч' => 0, + ' ъш' => 0, + ' ъщ' => 0, + ' ъъ' => 0, + ' ъы' => 0, + ' ъь' => 0, + ' ъэ' => 0, + ' ъю' => 0, + ' ъя' => 0, + ' ыё' => 0, + ' ыа' => 0, + ' ыб' => 0, + ' ыв' => 0, + ' ыг' => 0, + ' ыд' => 0, + ' ые' => 0, + ' ыж' => 0, + ' ыз' => 0, + ' ыи' => 0, + ' ый' => 0, + ' ык' => 0, + ' ыл' => 0, + ' ын' => 0, + ' ыо' => 0, + ' ып' => 0, + ' ыр' => 0, + ' ыс' => 0, + ' ыт' => 0, + ' ыу' => 0, + ' ыф' => 0, + ' ых' => 0, + ' ыц' => 0, + ' ыч' => 0, + ' ыш' => 0, + ' ыщ' 
=> 0, + ' ыъ' => 0, + ' ыы' => 0, + ' ыь' => 0, + ' ыэ' => 0, + ' ыю' => 0, + ' ыя' => 0, + ' ьё' => 0, + ' ьа' => 0, + ' ьб' => 0, + ' ьв' => 0, + ' ьг' => 0, + ' ьд' => 0, + ' ье' => 0, + ' ьж' => 0, + ' ьз' => 0, + ' ьи' => 0, + ' ьй' => 0, + ' ьк' => 0, + ' ьл' => 0, + ' ьм' => 0, + ' ьн' => 0, + ' ьо' => 0, + ' ьп' => 0, + ' ьр' => 0, + ' ьс' => 0, + ' ьт' => 0, + ' ьу' => 0, + ' ьф' => 0, + ' ьх' => 0, + ' ьц' => 0, + ' ьч' => 0, + ' ьш' => 0, + ' ьщ' => 0, + ' ьъ' => 0, + ' ьы' => 0, + ' ьь' => 0, + ' ьэ' => 0, + ' ью' => 0, + ' ья' => 0, + ' эё' => 0, + ' эа' => 0, + ' эе' => 0, + ' эи' => 0, + ' эц' => 0, + ' эч' => 0, + ' эщ' => 0, + ' эъ' => 0, + ' эы' => 0, + ' эь' => 0, + ' ээ' => 0, + ' эю' => 0, + ' юё' => 0, + ' юе' => 0, + ' юи' => 0, + ' юй' => 0, + ' юо' => 0, + ' юу' => 0, + ' юц' => 0, + ' юш' => 0, + ' ющ' => 0, + ' юъ' => 0, + ' юы' => 0, + ' юь' => 0, + ' юэ' => 0, + ' юя' => 0, + ' яё' => 0, + ' яа' => 0, + ' яе' => 0, + ' яж' => 0, + ' яо' => 0, + ' яу' => 0, + ' яф' => 0, + ' яц' => 0, + ' яъ' => 0, + ' яы' => 0, + ' яь' => 0, + ' яэ' => 0, + ' яю' => 0, + ' яя' => 0, + 'ёё' => 0, + 'ёё ' => 0, + 'ёа' => 0, + 'ёа ' => 0, + 'ёг ' => 0, + 'ёе' => 0, + 'ёе ' => 0, + 'ёи' => 0, + 'ёи ' => 0, + 'ёй' => 0, + 'ёо' => 0, + 'ёо ' => 0, + 'ёу' => 0, + 'ёу ' => 0, + 'ёф' => 0, + 'ёф ' => 0, + 'ёц ' => 0, + 'ёч ' => 0, + 'ёщ ' => 0, + 'ёъ' => 0, + 'ёъ ' => 0, + 'ёы' => 0, + 'ёы ' => 0, + 'ёь' => 0, + 'ёь ' => 0, + 'ёэ' => 0, + 'ёэ ' => 0, + 'ёю' => 0, + 'ёя' => 0, + 'ёя ' => 0, + 'аё ' => 0, + 'аа ' => 0, + 'аъ' => 0, + 'аъ ' => 0, + 'аы' => 0, + 'аы ' => 0, + 'аь' => 0, + 'аь ' => 0, + 'аэ ' => 0, + 'бё ' => 0, + 'бб ' => 0, + 'бв ' => 0, + 'бг ' => 0, + 'бд ' => 0, + 'бж ' => 0, + 'бз ' => 0, + 'бй' => 0, + 'бй ' => 0, + 'бк ' => 0, + 'бм ' => 0, + 'бн ' => 0, + 'бп ' => 0, + 'бт ' => 0, + 'бф ' => 0, + 'бх ' => 0, + 'бц ' => 0, + 'бч ' => 0, + 'бш ' => 0, + 'бщ ' => 0, + 'бъ ' => 0, + 'бэ ' => 0, + 'вё ' => 0, + 'вб ' => 0, + 'вв ' => 0, + 'вд ' 
=> 0, + 'вж' => 0, + 'вж ' => 0, + 'вз ' => 0, + 'вй' => 0, + 'вй ' => 0, + 'вл ' => 0, + 'вп ' => 0, + 'вф ' => 0, + 'вц ' => 0, + 'вч ' => 0, + 'вщ ' => 0, + 'въ' => 0, + 'въ ' => 0, + 'вэ ' => 0, + 'гё' => 0, + 'гё ' => 0, + 'гб ' => 0, + 'гг ' => 0, + 'гж' => 0, + 'гж ' => 0, + 'гз ' => 0, + 'гй' => 0, + 'гй ' => 0, + 'гк ' => 0, + 'гн ' => 0, + 'гп ' => 0, + 'гф ' => 0, + 'гх' => 0, + 'гх ' => 0, + 'гц' => 0, + 'гц ' => 0, + 'гч ' => 0, + 'гш ' => 0, + 'гщ ' => 0, + 'гъ' => 0, + 'гъ ' => 0, + 'гы ' => 0, + 'гь' => 0, + 'гь ' => 0, + 'гэ ' => 0, + 'гю' => 0, + 'гю ' => 0, + 'гя' => 0, + 'гя ' => 0, + 'дё ' => 0, + 'дб ' => 0, + 'дг ' => 0, + 'дд ' => 0, + 'дй' => 0, + 'дй ' => 0, + 'дк ' => 0, + 'дм ' => 0, + 'дн ' => 0, + 'дп ' => 0, + 'дс ' => 0, + 'дф ' => 0, + 'дх ' => 0, + 'дц ' => 0, + 'дч ' => 0, + 'дш ' => 0, + 'дщ ' => 0, + 'дъ ' => 0, + 'еа ' => 0, + 'еу ' => 0, + 'еъ' => 0, + 'еъ ' => 0, + 'еы' => 0, + 'еы ' => 0, + 'еь' => 0, + 'еь ' => 0, + 'еэ ' => 0, + 'жё ' => 0, + 'жв ' => 0, + 'жг ' => 0, + 'жж ' => 0, + 'жз ' => 0, + 'жй' => 0, + 'жй ' => 0, + 'жк ' => 0, + 'жл ' => 0, + 'жн ' => 0, + 'жп ' => 0, + 'жр ' => 0, + 'жс ' => 0, + 'жт ' => 0, + 'жф ' => 0, + 'жх ' => 0, + 'жц ' => 0, + 'жч ' => 0, + 'жш' => 0, + 'жш ' => 0, + 'жщ' => 0, + 'жщ ' => 0, + 'жъ' => 0, + 'жъ ' => 0, + 'жы ' => 0, + 'жэ ' => 0, + 'жю' => 0, + 'жю ' => 0, + 'жя' => 0, + 'жя ' => 0, + 'зё ' => 0, + 'зж ' => 0, + 'зз ' => 0, + 'зй' => 0, + 'зй ' => 0, + 'зк ' => 0, + 'зп ' => 0, + 'зр ' => 0, + 'зс ' => 0, + 'зт ' => 0, + 'зф' => 0, + 'зф ' => 0, + 'зх' => 0, + 'зх ' => 0, + 'зц ' => 0, + 'зч ' => 0, + 'зш ' => 0, + 'зщ' => 0, + 'зщ ' => 0, + 'зъ ' => 0, + 'зэ ' => 0, + 'иъ' => 0, + 'иъ ' => 0, + 'иы' => 0, + 'иы ' => 0, + 'иь' => 0, + 'иь ' => 0, + 'иэ ' => 0, + 'йё' => 0, + 'йё ' => 0, + 'йа ' => 0, + 'йв ' => 0, + 'йг ' => 0, + 'йж' => 0, + 'йж ' => 0, + 'йз ' => 0, + 'йи ' => 0, + 'йй' => 0, + 'йй ' => 0, + 'йо ' => 0, + 'йу' => 0, + 'йу ' => 0, + 'йч ' => 0, + 'йш ' => 
0, + 'йщ ' => 0, + 'йъ' => 0, + 'йъ ' => 0, + 'йы' => 0, + 'йы ' => 0, + 'йь' => 0, + 'йь ' => 0, + 'йэ' => 0, + 'йэ ' => 0, + 'йю' => 0, + 'йю ' => 0, + 'йя' => 0, + 'кё ' => 0, + 'кб ' => 0, + 'кг ' => 0, + 'кд ' => 0, + 'кж ' => 0, + 'кз ' => 0, + 'кй' => 0, + 'кй ' => 0, + 'км ' => 0, + 'кн ' => 0, + 'кф ' => 0, + 'кц ' => 0, + 'кч ' => 0, + 'кш ' => 0, + 'кщ' => 0, + 'кщ ' => 0, + 'къ' => 0, + 'къ ' => 0, + 'кы ' => 0, + 'кь ' => 0, + 'кэ' => 0, + 'кэ ' => 0, + 'кя' => 0, + 'кя ' => 0, + 'лв ' => 0, + 'лж ' => 0, + 'лз ' => 0, + 'лй' => 0, + 'лй ' => 0, + 'лр ' => 0, + 'лф ' => 0, + 'лх ' => 0, + 'лц ' => 0, + 'лч ' => 0, + 'лш ' => 0, + 'лщ ' => 0, + 'лъ' => 0, + 'лъ ' => 0, + 'лэ' => 0, + 'лэ ' => 0, + 'мё ' => 0, + 'мв ' => 0, + 'мг ' => 0, + 'мд ' => 0, + 'мз ' => 0, + 'мй' => 0, + 'мк ' => 0, + 'мл ' => 0, + 'мр ' => 0, + 'мх ' => 0, + 'мц ' => 0, + 'мч ' => 0, + 'мш ' => 0, + 'мщ ' => 0, + 'мъ' => 0, + 'мъ ' => 0, + 'мэ ' => 0, + 'мю ' => 0, + 'нё ' => 0, + 'нб ' => 0, + 'нв ' => 0, + 'нй' => 0, + 'нл ' => 0, + 'нп ' => 0, + 'нщ ' => 0, + 'нъ ' => 0, + 'нэ ' => 0, + 'оъ' => 0, + 'оъ ' => 0, + 'оы' => 0, + 'оы ' => 0, + 'оь' => 0, + 'оь ' => 0, + 'пё ' => 0, + 'пб ' => 0, + 'пв' => 0, + 'пв ' => 0, + 'пг' => 0, + 'пг ' => 0, + 'пд ' => 0, + 'пж' => 0, + 'пж ' => 0, + 'пз' => 0, + 'пз ' => 0, + 'пй' => 0, + 'пй ' => 0, + 'пк ' => 0, + 'пл ' => 0, + 'пм ' => 0, + 'пн ' => 0, + 'пф ' => 0, + 'пх ' => 0, + 'пц ' => 0, + 'пч ' => 0, + 'пш ' => 0, + 'пщ ' => 0, + 'пъ' => 0, + 'пъ ' => 0, + 'пэ' => 0, + 'пэ ' => 0, + 'пю ' => 0, + 'рё ' => 0, + 'рй' => 0, + 'рй ' => 0, + 'ръ' => 0, + 'ръ ' => 0, + 'рэ ' => 0, + 'сб ' => 0, + 'св ' => 0, + 'сг ' => 0, + 'сд ' => 0, + 'сж ' => 0, + 'сз' => 0, + 'сз ' => 0, + 'сй' => 0, + 'сй ' => 0, + 'сн ' => 0, + 'сп ' => 0, + 'сф ' => 0, + 'сц ' => 0, + 'сч ' => 0, + 'сш ' => 0, + 'сщ ' => 0, + 'съ ' => 0, + 'сэ ' => 0, + 'тб ' => 0, + 'тг ' => 0, + 'тд ' => 0, + 'тж ' => 0, + 'тз ' => 0, + 'тй' => 0, + 'тй ' => 0, + 'тк ' => 
0, + 'тл ' => 0, + 'тп ' => 0, + 'тф ' => 0, + 'тх ' => 0, + 'тц ' => 0, + 'тш ' => 0, + 'тщ ' => 0, + 'тъ ' => 0, + 'уё ' => 0, + 'уо ' => 0, + 'уу ' => 0, + 'уц ' => 0, + 'уъ' => 0, + 'уъ ' => 0, + 'уы' => 0, + 'уы ' => 0, + 'уь' => 0, + 'уь ' => 0, + 'уэ ' => 0, + 'фё ' => 0, + 'фб ' => 0, + 'фв ' => 0, + 'фг ' => 0, + 'фд ' => 0, + 'фж' => 0, + 'фж ' => 0, + 'фз' => 0, + 'фз ' => 0, + 'фй' => 0, + 'фй ' => 0, + 'фк ' => 0, + 'фл ' => 0, + 'фн ' => 0, + 'фп' => 0, + 'фп ' => 0, + 'фс ' => 0, + 'фх' => 0, + 'фх ' => 0, + 'фц' => 0, + 'фц ' => 0, + 'фч ' => 0, + 'фш ' => 0, + 'фщ ' => 0, + 'фъ' => 0, + 'фъ ' => 0, + 'фэ' => 0, + 'фэ ' => 0, + 'фю ' => 0, + 'хё' => 0, + 'хё ' => 0, + 'хб ' => 0, + 'хг ' => 0, + 'хд ' => 0, + 'хж ' => 0, + 'хз ' => 0, + 'хй' => 0, + 'хй ' => 0, + 'хк ' => 0, + 'хн ' => 0, + 'хп ' => 0, + 'хр ' => 0, + 'хс ' => 0, + 'хф ' => 0, + 'хх ' => 0, + 'хц ' => 0, + 'хч ' => 0, + 'хш ' => 0, + 'хщ' => 0, + 'хщ ' => 0, + 'хъ ' => 0, + 'хы' => 0, + 'хы ' => 0, + 'хь' => 0, + 'хь ' => 0, + 'хэ ' => 0, + 'хю' => 0, + 'хю ' => 0, + 'хя' => 0, + 'хя ' => 0, + 'цё' => 0, + 'цё ' => 0, + 'цб' => 0, + 'цб ' => 0, + 'цв ' => 0, + 'цг ' => 0, + 'цд ' => 0, + 'цж' => 0, + 'цж ' => 0, + 'цз ' => 0, + 'цй' => 0, + 'цй ' => 0, + 'цк ' => 0, + 'цл ' => 0, + 'цм ' => 0, + 'цн ' => 0, + 'цп ' => 0, + 'цр ' => 0, + 'цс ' => 0, + 'цт ' => 0, + 'цф' => 0, + 'цф ' => 0, + 'цх' => 0, + 'цх ' => 0, + 'цц ' => 0, + 'цч' => 0, + 'цч ' => 0, + 'цш ' => 0, + 'цщ' => 0, + 'цщ ' => 0, + 'цъ' => 0, + 'цъ ' => 0, + 'ць' => 0, + 'ць ' => 0, + 'цэ' => 0, + 'цэ ' => 0, + 'цю' => 0, + 'цю ' => 0, + 'ця' => 0, + 'ця ' => 0, + 'чё ' => 0, + 'чб ' => 0, + 'чг' => 0, + 'чг ' => 0, + 'чд' => 0, + 'чд ' => 0, + 'чж ' => 0, + 'чз' => 0, + 'чз ' => 0, + 'чй' => 0, + 'чй ' => 0, + 'чк ' => 0, + 'чл ' => 0, + 'чм ' => 0, + 'чн ' => 0, + 'чп' => 0, + 'чп ' => 0, + 'чр ' => 0, + 'чс ' => 0, + 'чф' => 0, + 'чф ' => 0, + 'чх ' => 0, + 'чц ' => 0, + 'чч ' => 0, + 'чш ' => 0, + 'чщ' => 0, + 
'чщ ' => 0, + 'чъ' => 0, + 'чъ ' => 0, + 'чы' => 0, + 'чы ' => 0, + 'чэ' => 0, + 'чэ ' => 0, + 'чю' => 0, + 'чю ' => 0, + 'чя' => 0, + 'чя ' => 0, + 'шё ' => 0, + 'шб ' => 0, + 'шг ' => 0, + 'шд' => 0, + 'шд ' => 0, + 'шж' => 0, + 'шж ' => 0, + 'шз' => 0, + 'шз ' => 0, + 'шй' => 0, + 'шй ' => 0, + 'шк ' => 0, + 'шл ' => 0, + 'шм ' => 0, + 'шн ' => 0, + 'шп ' => 0, + 'шр ' => 0, + 'шс ' => 0, + 'шф ' => 0, + 'шх' => 0, + 'шх ' => 0, + 'шч ' => 0, + 'шш' => 0, + 'шш ' => 0, + 'шщ' => 0, + 'шщ ' => 0, + 'шъ' => 0, + 'шъ ' => 0, + 'шы' => 0, + 'шы ' => 0, + 'шэ' => 0, + 'шэ ' => 0, + 'шя' => 0, + 'шя ' => 0, + 'щб' => 0, + 'щб ' => 0, + 'щв ' => 0, + 'щг' => 0, + 'щг ' => 0, + 'щд' => 0, + 'щд ' => 0, + 'щж' => 0, + 'щж ' => 0, + 'щз' => 0, + 'щз ' => 0, + 'щй' => 0, + 'щй ' => 0, + 'щк' => 0, + 'щк ' => 0, + 'щл' => 0, + 'щл ' => 0, + 'щм ' => 0, + 'щн ' => 0, + 'щп' => 0, + 'щп ' => 0, + 'щр ' => 0, + 'щс' => 0, + 'щс ' => 0, + 'щт' => 0, + 'щт ' => 0, + 'щф' => 0, + 'щф ' => 0, + 'щх' => 0, + 'щх ' => 0, + 'щц' => 0, + 'щц ' => 0, + 'щч' => 0, + 'щч ' => 0, + 'щш' => 0, + 'щш ' => 0, + 'щщ' => 0, + 'щщ ' => 0, + 'щъ' => 0, + 'щъ ' => 0, + 'щы' => 0, + 'щы ' => 0, + 'щэ' => 0, + 'щэ ' => 0, + 'щю' => 0, + 'щю ' => 0, + 'щя' => 0, + 'щя ' => 0, + 'ъё ' => 0, + 'ъа' => 0, + 'ъа ' => 0, + 'ъб' => 0, + 'ъб ' => 0, + 'ъв' => 0, + 'ъв ' => 0, + 'ъг' => 0, + 'ъг ' => 0, + 'ъд' => 0, + 'ъд ' => 0, + 'ъе ' => 0, + 'ъж' => 0, + 'ъж ' => 0, + 'ъз' => 0, + 'ъз ' => 0, + 'ъи' => 0, + 'ъи ' => 0, + 'ъй' => 0, + 'ъй ' => 0, + 'ък' => 0, + 'ък ' => 0, + 'ъл' => 0, + 'ъл ' => 0, + 'ъм' => 0, + 'ъм ' => 0, + 'ън' => 0, + 'ън ' => 0, + 'ъо' => 0, + 'ъо ' => 0, + 'ъп' => 0, + 'ъп ' => 0, + 'ър' => 0, + 'ър ' => 0, + 'ъс' => 0, + 'ъс ' => 0, + 'ът' => 0, + 'ът ' => 0, + 'ъу' => 0, + 'ъу ' => 0, + 'ъф' => 0, + 'ъф ' => 0, + 'ъх' => 0, + 'ъх ' => 0, + 'ъц' => 0, + 'ъц ' => 0, + 'ъч' => 0, + 'ъч ' => 0, + 'ъш' => 0, + 'ъш ' => 0, + 'ъщ' => 0, + 'ъщ ' => 0, + 'ъъ' => 0, + 'ъъ ' => 0, + 'ъы' 
=> 0, + 'ъы ' => 0, + 'ъь' => 0, + 'ъь ' => 0, + 'ъэ' => 0, + 'ъэ ' => 0, + 'ъю ' => 0, + 'ъя ' => 0, + 'ыё' => 0, + 'ыё ' => 0, + 'ыа' => 0, + 'ыа ' => 0, + 'ыи ' => 0, + 'ыо ' => 0, + 'ыу ' => 0, + 'ыф ' => 0, + 'ыъ' => 0, + 'ыъ ' => 0, + 'ыы' => 0, + 'ыы ' => 0, + 'ыь' => 0, + 'ыь ' => 0, + 'ыэ' => 0, + 'ыэ ' => 0, + 'ыю ' => 0, + 'ьа' => 0, + 'ьа ' => 0, + 'ьв ' => 0, + 'ьг ' => 0, + 'ьж ' => 0, + 'ьз ' => 0, + 'ьй' => 0, + 'ьй ' => 0, + 'ьл ' => 0, + 'ьн ' => 0, + 'ьр ' => 0, + 'ьу' => 0, + 'ьу ' => 0, + 'ьх ' => 0, + 'ьщ ' => 0, + 'ьъ' => 0, + 'ьъ ' => 0, + 'ьы ' => 0, + 'ьь' => 0, + 'ьь ' => 0, + 'ьэ ' => 0, + 'эё' => 0, + 'эё ' => 0, + 'эа' => 0, + 'эа ' => 0, + 'эб' => 0, + 'эб ' => 0, + 'эв ' => 0, + 'эг ' => 0, + 'эд ' => 0, + 'эе' => 0, + 'эе ' => 0, + 'эж' => 0, + 'эж ' => 0, + 'эз ' => 0, + 'эи ' => 0, + 'эй ' => 0, + 'эл ' => 0, + 'эм ' => 0, + 'эн ' => 0, + 'эо' => 0, + 'эо ' => 0, + 'эу' => 0, + 'эу ' => 0, + 'эф ' => 0, + 'эх ' => 0, + 'эц' => 0, + 'эц ' => 0, + 'эч' => 0, + 'эч ' => 0, + 'эш ' => 0, + 'эщ' => 0, + 'эщ ' => 0, + 'эъ' => 0, + 'эъ ' => 0, + 'эы' => 0, + 'эы ' => 0, + 'эь' => 0, + 'эь ' => 0, + 'ээ ' => 0, + 'эю' => 0, + 'эю ' => 0, + 'эя' => 0, + 'эя ' => 0, + 'юё' => 0, + 'юё ' => 0, + 'юа ' => 0, + 'юе ' => 0, + 'юж ' => 0, + 'юи ' => 0, + 'юл ' => 0, + 'юо ' => 0, + 'юу' => 0, + 'юу ' => 0, + 'юц ' => 0, + 'юъ' => 0, + 'юъ ' => 0, + 'юы' => 0, + 'юы ' => 0, + 'юь' => 0, + 'юь ' => 0, + 'юэ ' => 0, + 'юя' => 0, + 'яё' => 0, + 'яё ' => 0, + 'яа' => 0, + 'яа ' => 0, + 'яе ' => 0, + 'яо ' => 0, + 'яф' => 0, + 'яф ' => 0, + 'яъ' => 0, + 'яъ ' => 0, + 'яы' => 0, + 'яы ' => 0, + 'яь' => 0, + 'яь ' => 0, + 'яэ' => 0, + 'яэ ' => 0, + #en + ' \'f' => 0, + ' \'p' => 0, + ' \'q' => 0, + ' \'r' => 0, + ' \'x' => 0, + ' \'y' => 0, + ' \'z' => 0, + ' bj' => 0, + ' bq' => 0, + ' bz' => 0, + ' c\'' => 0, + ' cq' => 0, + ' cv' => 0, + ' cx' => 0, + ' dq' => 0, + ' dx' => 0, + ' ez' => 0, + ' f\'' => 0, + ' fh' => 0, + ' fk' => 0, + ' fq' => 0, + ' 
fv' => 0, + ' fw' => 0, + ' fz' => 0, + ' g\'' => 0, + ' gf' => 0, + ' gg' => 0, + ' gj' => 0, + ' gv' => 0, + ' gx' => 0, + ' gz' => 0, + ' h\'' => 0, + ' hj' => 0, + ' hk' => 0, + ' hn' => 0, + ' hq' => 0, + ' hx' => 0, + ' iq' => 0, + ' iw' => 0, + ' iy' => 0, + ' jb' => 0, + ' jf' => 0, + ' jh' => 0, + ' jj' => 0, + ' jk' => 0, + ' jl' => 0, + ' jm' => 0, + ' jq' => 0, + ' jw' => 0, + ' jx' => 0, + ' jy' => 0, + ' jz' => 0, + ' k\'' => 0, + ' kf' => 0, + ' kj' => 0, + ' kq' => 0, + ' kt' => 0, + ' kx' => 0, + ' kz' => 0, + ' lj' => 0, + ' lk' => 0, + ' lq' => 0, + ' lv' => 0, + ' mj' => 0, + ' mq' => 0, + ' mz' => 0, + ' nj' => 0, + ' nk' => 0, + ' nq' => 0, + ' nz' => 0, + ' oq' => 0, + ' pj' => 0, + ' pz' => 0, + ' qb' => 0, + ' qe' => 0, + ' qf' => 0, + ' qg' => 0, + ' qh' => 0, + ' qj' => 0, + ' qk' => 0, + ' qo' => 0, + ' qp' => 0, + ' qs' => 0, + ' qv' => 0, + ' qx' => 0, + ' qy' => 0, + ' qz' => 0, + ' rb' => 0, + ' rk' => 0, + ' rq' => 0, + ' rv' => 0, + ' rx' => 0, + ' rz' => 0, + ' sz' => 0, + ' tf' => 0, + ' tg' => 0, + ' tj' => 0, + ' tq' => 0, + ' u\'' => 0, + ' ue' => 0, + ' uj' => 0, + ' uo' => 0, + ' uq' => 0, + ' uu' => 0, + ' uy' => 0, + ' vb' => 0, + ' vj' => 0, + ' vk' => 0, + ' vn' => 0, + ' vq' => 0, + ' vr' => 0, + ' vv' => 0, + ' vw' => 0, + ' vx' => 0, + ' vy' => 0, + ' vz' => 0, + ' wj' => 0, + ' wl' => 0, + ' wn' => 0, + ' wq' => 0, + ' wx' => 0, + ' wz' => 0, + ' xb' => 0, + ' xf' => 0, + ' xg' => 0, + ' xh' => 0, + ' xj' => 0, + ' xk' => 0, + ' xq' => 0, + ' xt' => 0, + ' xu' => 0, + ' xz' => 0, + ' yf' => 0, + ' yg' => 0, + ' yh' => 0, + ' yj' => 0, + ' yk' => 0, + ' yl' => 0, + ' yn' => 0, + ' yq' => 0, + ' yv' => 0, + ' yx' => 0, + ' yy' => 0, + ' yz' => 0, + ' z\'' => 0, + ' zb' => 0, + ' zc' => 0, + ' zd' => 0, + ' zf' => 0, + ' zg' => 0, + ' zh' => 0, + ' zj' => 0, + ' zk' => 0, + ' zl' => 0, + ' zm' => 0, + ' zq' => 0, + ' zr' => 0, + ' zv' => 0, + ' zw' => 0, + ' zx' => 0, + ' zz' => 0, + '\'a ' => 0, + '\'b' => 0, + '\'b ' 
=> 0, + '\'c ' => 0, + '\'f' => 0, + '\'f ' => 0, + '\'g' => 0, + '\'g ' => 0, + '\'h ' => 0, + '\'i ' => 0, + '\'j' => 0, + '\'j ' => 0, + '\'k' => 0, + '\'k ' => 0, + '\'l ' => 0, + '\'n ' => 0, + '\'o ' => 0, + '\'p ' => 0, + '\'q' => 0, + '\'q ' => 0, + '\'r ' => 0, + '\'u' => 0, + '\'u ' => 0, + '\'v ' => 0, + '\'w ' => 0, + '\'x' => 0, + '\'x ' => 0, + '\'z' => 0, + '\'z ' => 0, + 'b\' ' => 0, + 'bg ' => 0, + 'bh ' => 0, + 'bp ' => 0, + 'bq' => 0, + 'bq ' => 0, + 'bv ' => 0, + 'bx' => 0, + 'bz' => 0, + 'bz ' => 0, + 'c\' ' => 0, + 'cf ' => 0, + 'cj' => 0, + 'cn ' => 0, + 'cq ' => 0, + 'cv' => 0, + 'cw' => 0, + 'cx' => 0, + 'cx ' => 0, + 'cz ' => 0, + 'db ' => 0, + 'dj ' => 0, + 'dk ' => 0, + 'dw ' => 0, + 'dx' => 0, + 'eh ' => 0, + 'ej ' => 0, + 'f\' ' => 0, + 'fg ' => 0, + 'fh ' => 0, + 'fj' => 0, + 'fj ' => 0, + 'fk' => 0, + 'fk ' => 0, + 'fq' => 0, + 'fq ' => 0, + 'fv ' => 0, + 'fw ' => 0, + 'fx' => 0, + 'fx ' => 0, + 'fz' => 0, + 'fz ' => 0, + 'g\' ' => 0, + 'gc ' => 0, + 'gf ' => 0, + 'gj ' => 0, + 'gk ' => 0, + 'gl ' => 0, + 'gq' => 0, + 'gq ' => 0, + 'gv' => 0, + 'gv ' => 0, + 'gw ' => 0, + 'gx' => 0, + 'gx ' => 0, + 'gz ' => 0, + 'hb ' => 0, + 'hc ' => 0, + 'hg ' => 0, + 'hh ' => 0, + 'hj' => 0, + 'hj ' => 0, + 'hk ' => 0, + 'hv' => 0, + 'hv ' => 0, + 'hw ' => 0, + 'hx' => 0, + 'hx ' => 0, + 'hz' => 0, + 'i\' ' => 0, + 'ih ' => 0, + 'iq ' => 0, + 'iw ' => 0, + 'j\'' => 0, + 'j\' ' => 0, + 'jb' => 0, + 'jb ' => 0, + 'jc' => 0, + 'jc ' => 0, + 'jd' => 0, + 'jf' => 0, + 'jg' => 0, + 'jg ' => 0, + 'jh' => 0, + 'jh ' => 0, + 'jj' => 0, + 'jj ' => 0, + 'jk ' => 0, + 'jl ' => 0, + 'jm' => 0, + 'jm ' => 0, + 'jn' => 0, + 'jn ' => 0, + 'jp ' => 0, + 'jq' => 0, + 'jq ' => 0, + 'jr' => 0, + 'jr ' => 0, + 'js' => 0, + 'js ' => 0, + 'jt' => 0, + 'ju ' => 0, + 'jv' => 0, + 'jv ' => 0, + 'jw' => 0, + 'jw ' => 0, + 'jx' => 0, + 'jx ' => 0, + 'jy' => 0, + 'jy ' => 0, + 'jz' => 0, + 'jz ' => 0, + 'kb ' => 0, + 'kc ' => 0, + 'kd ' => 0, + 'kj ' => 0, + 'km ' => 0, + 'kp 
' => 0, + 'kq' => 0, + 'kq ' => 0, + 'kv' => 0, + 'kv ' => 0, + 'kx' => 0, + 'kx ' => 0, + 'kz' => 0, + 'kz ' => 0, + 'lg ' => 0, + 'lh ' => 0, + 'lj ' => 0, + 'lq ' => 0, + 'lr ' => 0, + 'lv ' => 0, + 'lw ' => 0, + 'lx' => 0, + 'lz ' => 0, + 'm\' ' => 0, + 'mg ' => 0, + 'mh ' => 0, + 'mj ' => 0, + 'mk ' => 0, + 'mq' => 0, + 'mq ' => 0, + 'mx' => 0, + 'mx ' => 0, + 'mz' => 0, + 'nb ' => 0, + 'nm ' => 0, + 'pj ' => 0, + 'pk ' => 0, + 'pq' => 0, + 'pq ' => 0, + 'pv' => 0, + 'pw ' => 0, + 'px' => 0, + 'px ' => 0, + 'pz ' => 0, + 'q\'' => 0, + 'q\' ' => 0, + 'qa ' => 0, + 'qb' => 0, + 'qb ' => 0, + 'qc' => 0, + 'qc ' => 0, + 'qd' => 0, + 'qd ' => 0, + 'qe' => 0, + 'qe ' => 0, + 'qf' => 0, + 'qf ' => 0, + 'qg' => 0, + 'qg ' => 0, + 'qh' => 0, + 'qh ' => 0, + 'qi' => 0, + 'qj' => 0, + 'qj ' => 0, + 'qk' => 0, + 'qk ' => 0, + 'ql' => 0, + 'ql ' => 0, + 'qm' => 0, + 'qm ' => 0, + 'qn' => 0, + 'qn ' => 0, + 'qo' => 0, + 'qo ' => 0, + 'qp' => 0, + 'qp ' => 0, + 'qq' => 0, + 'qq ' => 0, + 'qr' => 0, + 'qs' => 0, + 'qs ' => 0, + 'qt' => 0, + 'qt ' => 0, + 'qu ' => 0, + 'qv' => 0, + 'qv ' => 0, + 'qw' => 0, + 'qw ' => 0, + 'qx' => 0, + 'qx ' => 0, + 'qy' => 0, + 'qy ' => 0, + 'qz' => 0, + 'qz ' => 0, + 'rq ' => 0, + 'rz ' => 0, + 'sg ' => 0, + 'sj ' => 0, + 'sx' => 0, + 'sx ' => 0, + 'sz' => 0, + 'sz ' => 0, + 'tg ' => 0, + 'tj ' => 0, + 'tq' => 0, + 'tq ' => 0, + 'tx' => 0, + 'tx ' => 0, + 'uj ' => 0, + 'uq ' => 0, + 'uu ' => 0, + 'uw ' => 0, + 'v\' ' => 0, + 'vb' => 0, + 'vb ' => 0, + 'vc' => 0, + 'vf' => 0, + 'vf ' => 0, + 'vg' => 0, + 'vh' => 0, + 'vh ' => 0, + 'vj' => 0, + 'vj ' => 0, + 'vk' => 0, + 'vk ' => 0, + 'vl ' => 0, + 'vm' => 0, + 'vn ' => 0, + 'vp' => 0, + 'vp ' => 0, + 'vq' => 0, + 'vq ' => 0, + 'vr ' => 0, + 'vv ' => 0, + 'vw' => 0, + 'vw ' => 0, + 'vx' => 0, + 'vz' => 0, + 'vz ' => 0, + 'w\' ' => 0, + 'wb ' => 0, + 'wc ' => 0, + 'wf ' => 0, + 'wg ' => 0, + 'wj' => 0, + 'wj ' => 0, + 'wq' => 0, + 'wq ' => 0, + 'wr ' => 0, + 'wv' => 0, + 'wv ' => 0, + 'wx' => 0, 
+ 'wz ' => 0, + 'x\'' => 0, + 'x\' ' => 0, + 'xa ' => 0, + 'xb ' => 0, + 'xc ' => 0, + 'xd' => 0, + 'xd ' => 0, + 'xf ' => 0, + 'xg ' => 0, + 'xh ' => 0, + 'xj' => 0, + 'xj ' => 0, + 'xk' => 0, + 'xk ' => 0, + 'xl ' => 0, + 'xm ' => 0, + 'xn' => 0, + 'xn ' => 0, + 'xp ' => 0, + 'xq ' => 0, + 'xr' => 0, + 'xr ' => 0, + 'xs ' => 0, + 'xu ' => 0, + 'xv' => 0, + 'xv ' => 0, + 'xw ' => 0, + 'xx' => 0, + 'xz' => 0, + 'xz ' => 0, + 'yb ' => 0, + 'yc ' => 0, + 'yd ' => 0, + 'yf ' => 0, + 'yg ' => 0, + 'yh ' => 0, + 'yj ' => 0, + 'yq' => 0, + 'yq ' => 0, + 'yu ' => 0, + 'yv ' => 0, + 'yw ' => 0, + 'yy' => 0, + 'yy ' => 0, + 'yz ' => 0, + 'z\'' => 0, + 'z\' ' => 0, + 'zb ' => 0, + 'zc' => 0, + 'zc ' => 0, + 'zd' => 0, + 'zd ' => 0, + 'zf' => 0, + 'zf ' => 0, + 'zg ' => 0, + 'zh' => 0, + 'zh ' => 0, + 'zj' => 0, + 'zj ' => 0, + 'zk ' => 0, + 'zl ' => 0, + 'zn' => 0, + 'zn ' => 0, + 'zp ' => 0, + 'zq' => 0, + 'zq ' => 0, + 'zr' => 0, + 'zr ' => 0, + 'zs' => 0, + 'zs ' => 0, + 'zt' => 0, + 'zt ' => 0, + 'zu ' => 0, + 'zv ' => 0, + 'zw ' => 0, + 'zx' => 0, + 'zx ' => 0, + );

    /**
     * Builds the two layout-correction regular expressions and the flipped
     * replacement tables, then merges caller-supplied exception words.
     *
     * NOTE: the "@param" tags of this docblock are parsed at runtime by
     * ReflectionTypeHint::isValid(); keep their types and names in sync
     * with the signature.
     *
     * @param   array|null  $words_exceptions
     */
    public function __construct(array $words_exceptions = null)
    {
        #the return value of a constructor is ignored by "new"; this only aborts initialisation
        if (! ReflectionTypeHint::isValid()) return false;

        #Russian --> English:
        $this->en_correct = '/(?: (?:' . $this->tt_f . ')
                                  (?: (?:' . $this->en_uniq . ') | (?:' . $this->en_sc . '){2} )
                                | (?:' . $this->en_sc . ')
                                  (?:' . $this->tt_f . ')
                                  (?:' . $this->en_sc . ')
                                | (?: (?:' . $this->en_uniq . ') | (?:' . $this->en_sc . '){2} )
                                  (?:' . $this->tt_f . ')
                              )
                             /sxSX';

        #English --> Russian:
        $this->tt_correct = '/(?: (?:' . $this->en_sc . ')
                                  (?: (?:' . $this->tt_uniq . ') | (?:' . $this->tt_f . '){2} )
                                | (?:' . $this->tt_f . ')
                                  (?:' . $this->en_sc . ')
                                  (?:' . $this->tt_f . ')
                                | (?: (?:' . $this->tt_uniq . ') | (?:' . $this->tt_f . '){2} )
                                  (?:' . $this->en_sc . ')
                              )
                             /sxSX';

        $this->table_flip = array(
            0 => array_flip($this->table[0]),
            1 => array_flip($this->table[1]),
        );
        if (is_array($words_exceptions)) $this->words_exceptions += $words_exceptions;
    }

    /**
     * Fixes keyboard typos in a text.
     *
     * NOTE: the "@param" tags of this docblock are parsed at runtime by
     * ReflectionTypeHint::isValid(); keep their types and names in sync
     * with the signature.
     *
     * @param   scalar|null  $s       Text in UTF-8. Non-string values are returned untouched.
     * @param   int          $mode    self::SIMILAR_CHARS and/or self::KEYBOARD_LAYOUT (may be combined,
     *                                optionally with self::ADD_FIX). According to the original notes,
     *                                self::KEYBOARD_LAYOUT makes the call roughly 10 times slower.
     * @param   array        &$words  Receives the corrected words: originals in the keys,
     *                                corrected words in the values.
     * @return  string|bool           Returns FALSE if error occured
     */
    public function parse($s, $mode = self::SIMILAR_CHARS, array &$words = null)
    {
        if (! ReflectionTypeHint::isValid()) return false;
        if (! is_string($s)) return $s;

        if ($mode < self::SIMILAR_CHARS || $mode > (self::SIMILAR_CHARS | self::KEYBOARD_LAYOUT | self::ADD_FIX))
        {
            trigger_error('Unknown mode', E_USER_WARNING);
            return false;
        }

        $this->mode = $mode;

        #characters that are cut out together with the diacritical marks
        $additional_chars = array(
            "\xc2\xad",  #soft hyphens (U+00AD)
        );
        #http://ru.wikipedia.org/wiki/Диакритические_знаки
        $s = UTF8::diactrical_remove($s, $additional_chars, $is_can_restored = true, $restore_table);

        $this->words = array();
        $s = $this->_parse1($s);
        $s = $this->_parse2($s);
        $s = UTF8::diactrical_restore($s, $restore_table);
        $words = $this->words;
        return $s;
    }

    /**
     * Runs every word-like run (Latin letters, Russian/Tatar letters and the
     * "layout" punctuation characters, at least 3 in a row) through _word().
     */
    private function _parse1($s)
    {
        return preg_replace_callback('/(?>  (' . $this->en . ')  #1 латинские буквы
                                          | (' . $this->tt . ')  #2 русские буквы
                                          | (' . $this->sc . ')  #3 символы, которые м.б. набраны по ошибке в английской раскладке клавиатуры вместо русских букв
                                        ){3,}+
                                      /sxSX', array($this, '_word'), $s);
    }

    /**
     * Replaces Russian letters that look like Latin ones with their Latin twins
     * when they stand next to digits, e.g. in spare-part catalogue numbers such as
     * 1500A023, 52511-60900-H0, K2305, XA527672. According to the original notes,
     * entries like '1-ое', 'Ту-134' and 'А19-3107/06-43-Ф02-4227/06-С1' are
     * meant to be left correct.
     */
    private function _parse2($s)
    {
        if (version_compare(PHP_VERSION, '5.2.0', '<')) return $s;
        #the hyphen after \d is escaped: PCRE2 (PHP >= 7.3) rejects "[\d-_/]" as an invalid range
        return preg_replace_callback('~(?: (?<=[^-_/]|^)
                                          (?:' . $this->ru_similar . ')++
                                          (?= (?:' . $this->en . '|[-_/])*+ (?<=[^-_/]|' . $this->en . '[-_/])
                                              \d [\d\-_/]*+ (?!' . $this->tt_uniq . ')
                                          )
                                        | (?<=[^-_/]|^)
                                          \d (?:' . $this->en . '|[-_/])*+ (?<=[^-_/]|' . $this->en . '[-_/])
                                          \K
                                          (?:' . $this->ru_similar . ')++
                                          (?= [\d\-_/]*+ (?!' . $this->tt_uniq . ') )
                                       )
                                      ~sxSX', array($this, '_entry'), $s);
    }

    /**
     * preg_replace_callback() handler for _parse2(): maps the matched look-alike
     * letters through table[0] and records the change in $this->words.
     *
     * The argument is taken by value: preg_replace_callback() never passes a
     * reference, and a by-reference parameter raises a warning on PHP 7+.
     */
    private function _entry(array $a)
    {
        $entry = $a[0];
        $s = strtr($entry, $this->table[0]);
        if ($s !== $entry) $this->words[$entry] = $s;
        return $s;
    }

    /**
     * preg_replace_callback() handler for _parse1(): decides whether one word
     * needs correction and returns either the original or the corrected word.
     *
     * $a[1], $a[2] and $a[3] are the captures of _parse1(): Latin letter,
     * Russian/Tatar letter and "layout" punctuation character respectively.
     *
     * The argument is taken by value: preg_replace_callback() never passes a
     * reference, and a by-reference parameter raises a warning on PHP 7+.
     */
    private function _word(array $a)
    {
        $word = $a[0];

        $suggestions = array();

        #the word is a mix of Russian and Latin letters
        if (! empty($a[1]) && ! empty($a[2]))
        {
            if (($this->mode & self::SIMILAR_CHARS) === 0) return $word;
            #ATTENTION! the order of the transformation rules below matters!

            /*
            Fix look-alike letters typed in the wrong layout in initials
            that precede a surname (Russian <--> English), e.g. "Т.С.Навка".
            */

            #0a. English --> Russian:
            if (substr($word, 1, 1) === '.'  #cheap pre-check before running the regexp
                && preg_match('/^ ( ' . $this->en_similar_uc . '\.      #первый инициал
                                    (?:' . $this->en_similar_uc . '\.)?  #второй инициал (необязательно)
                                  )                                      #1 инициалы
                                  (' . $this->no_sc . '{2,}+)            #2 фамилия (английские и русские буквы)
                                $/sxSX', $word, $m))
            {
                $m[2] = $this->_parse1($m[2]);
                #is the surname Russian?
                if (preg_match('/^ (?:' . $this->tt_uc . ')  #первая буква д.б. большая
                                   (?:' . $this->tt_f . ')+  #минимальное кол-во букв в фамилии = 2
                                $/sxSX', $m[2])) return strtr($m[1], $this->table_flip[0]) . $m[2];
            }

            #0b. Russian --> English:
            if (substr($word, 2, 1) === '.'  #cheap pre-check before running the regexp
                && preg_match('/^ ( ' . $this->ru_similar_uc . '\.      #первый инициал
                                    (?:' . $this->ru_similar_uc . '\.)?  #второй инициал (необязательно)
                                  )                                      #1 инициалы
                                  (' . $this->no_sc . '{2,}+)            #2 фамилия (английские и русские буквы)
                                $/sxSX', $word, $m))
            {
                $m[2] = $this->_parse1($m[2]);
                #is the surname English?
                if (preg_match('/^ ' . $this->en_uc . '  #первая буква д.б. большая
                                   ' . $this->en . '++   #минимальное кол-во букв в фамилии = 2
                                $/sxSX', $m[2])) return strtr($m[1], $this->table[0]) . $m[2];
            }

            #1. English --> Russian:
            $this->method = 0;  #letters that look the same
            $this->is_flip = true;
            $s = $this->_replace($word, $this->tt_correct);
            if ($word !== $s && ! $this->_is_mixed($s)) $suggestions['tt0'] = $s;

            #2. English --> Russian:
            $this->method = 1;  #letters typed in the other keyboard layout
            $this->is_flip = true;
            $s = $this->_replace($word, $this->tt_correct);
            if ($word !== $s) $suggestions['tt1'] = $s;

            #3. Russian --> English:
            $this->method = 0;  #letters that look the same
            $this->is_flip = false;
            $s = $this->_replace($word, $this->en_correct);
            if ($word !== $s && ! $this->_is_mixed($s)) $suggestions['en0'] = $s;

            #4. Russian --> English:
            $this->method = 1;  #letters typed in the other keyboard layout
            $this->is_flip = false;
            $s = $this->_replace($word, $this->en_correct);
            if ($word !== $s) $suggestions['en1'] = $s;
        }
        #the word consists of Latin letters only; minimal length is 4 letters
        elseif (! empty($a[1]) && strlen($word) >= 4)
        {
            if (($this->mode & self::KEYBOARD_LAYOUT) === 0) return $word;

            #abbreviations are not processed, example: AMPAS
            if (preg_match('/^(?:' . $this->en_uc . '|' . $this->sc . '){1,6}+$/sxSX', $word)) return $word;

            #English --> Russian:
            $suggestions['en1'] = $word;
            $suggestions['tt1'] = strtr($word, $this->table_flip[1]);
        }
        #the word consists of Russian letters only; minimal length is 4 letters (strlen() counts bytes, hence 8)
        elseif (! empty($a[2]) && strlen($word) >= 8)
        {
            if (($this->mode & self::KEYBOARD_LAYOUT) === 0) return $word;

            #abbreviations are not processed, example: ДОСААФ
            if (preg_match('/^(?:' . $this->tt_uc . '|' . $this->sc . '){1,6}+$/sxSX', $word)) return $word;

            #Russian --> English:
            $suggestions['tt1'] = $word;
            $suggestions['en1'] = strtr($word, $this->table[1]);
        }
        #only special characters were found or the word is too short
        else return $word;

        $suggestions = array_unique($suggestions);

        $c = count($suggestions);
        if ($c === 0) $s = $word;
        else $s = $this->_detect($word, $suggestions, ! empty($a[3]));
        if ($s !== $word)
        {
            $this->words[$word] = $s;
            if ($this->mode >= (self::KEYBOARD_LAYOUT | self::ADD_FIX)) $s = '(' . $word . '=>' . $s . ')';
        }
        return $s;
    }

    /**
     * Applies $regexp with the _strtr() callback repeatedly until the word
     * stops changing.
     */
    private function _replace($word, $regexp)
    {
        do
        {
            $previous = $word;
            $word = preg_replace_callback($regexp, array($this, '_strtr'), $previous);
        }
        while ($previous !== $word);
        return $word;
    }

    /**
     * preg_replace_callback() handler for _replace(): translates the match with
     * the table selected by $this->method, flipped when $this->is_flip is set.
     */
    private function _strtr(array $a)
    {
        $table = $this->is_flip ? $this->table_flip[$this->method] : $this->table[$this->method];
        return strtr($a[0], $table);
    }

    /**
     * Tells whether the word contains both Latin and Russian/Tatar letters.
     */
    private function _is_mixed($word)
    {
        return preg_match('/(?:' . $this->en . ')/sxSX', $word) &&
               preg_match('/(?:' . $this->tt_f . ')/sxSX', $word);
    }

    /**
     * Picks one of several candidate corrections.
     *
     * Keys of $suggestions start with the language code ('en' or 'tt') that is
     * passed on to _bigram_exists(). $is_sc tells whether the original word
     * contained "layout" punctuation characters.
     */
    private function _detect($word, array $suggestions, $is_sc)
    {
        #candidates must not contain non-existent N-grams
        foreach ($suggestions as $type => $w)
        {
            $lang = substr($type, 0, 2);
            if ($this->_bigram_exists($w, $lang)) unset($suggestions[$type]);
        }
        if (count($suggestions) === 0) return $word;

        $s = end($suggestions);

        #$word had special characters and $s has none left: return $s
        if ($is_sc && ! preg_match('/' . $this->sc . '/sSX', $s)) return $s;

        #$s has more special characters than letters: return $word
        $sc_count = 0;
        $s = preg_replace('/' . $this->sc . '/sSX', '', $s, -1, $sc_count);
        if ($sc_count > 0 && $sc_count > UTF8::strlen($s)) return $word;

        return reset($suggestions);
    }

    /**
     * N-gram analysis for Russian and English.
     *
     * Despite the name, TRUE means the word looks impossible for $lang: it has
     * 4 consonants or 3 vowels in a row that are not whitelisted, or it
     * contains a bigram listed in $this->bigrams.
     */
    private function _bigram_exists($word, $lang)
    {
        $word = ($lang === 'en') ? strtolower($word) : UTF8::lowercase($word);

        #step 0.
        #words from the exception list are always accepted
        if (array_key_exists($word, $this->words_exceptions[$lang])) return false;

        #step 1.
        #4 consonants in a row; example: больши{нств}о, юрисконсу{льтс}тво
        if (preg_match('/(?:' . $this->consonant_lc[$lang] . '){4}/sxSX', $word, $m)
            #unless the run is in the exception list
            && ! array_key_exists($m[0], $this->consonants4_lc[$lang])) return true;

        #step 2.
        #3 vowels in a row; example: длиннош{еее}, зм{еео}бразный
        if (preg_match('/(?:' . $this->vowel_lc[$lang] . '){3}/sxSX', $word, $m)
            #unless the run is in the exception list
            && ! array_key_exists($m[0], $this->vowels3_lc[$lang])) return true;

        #step 3.
        $length = UTF8::strlen($word);
        for ($pos = 0, $limit = $length - 1; $pos < $limit; $pos++)
        {
            /*
            TODO The check could be made slightly better by taking every bigram
                 position into account, not only the beginning and the end of the word.
            */
            $ss = UTF8::substr($word, $pos, 2);
            if ($pos === 0) $ss = ' ' . $ss;  #beginning of word
            elseif ($pos === $limit - 1) $ss = $ss . ' ';  #ending of word
            if (array_key_exists($ss, $this->bigrams)) return true;
        }

        return false;
    }
}
\ No newline at end of file diff --git a/upload/includes/class.reflection.php b/upload/includes/class.reflection.php new file mode 100644 index 000000000..c0f7839ce --- /dev/null +++ b/upload/includes/class.reflection.php @@ -0,0 +1,183 @@ + 'is_int', + 'integer' => 'is_int', + 'digit' => 'ctype_digit', + 'number' => 'ctype_digit', + 'float' => 'is_float', + 'double' => 'is_float', + 'real' => 'is_float', + 'numeric' => 'is_numeric', + 'str' => 'is_string', + 'string' => 'is_string', + 'char' => 'is_string', + 'bool' => 'is_bool', + 'boolean' => 'is_bool', + 'null' => 'is_null', + 'array' => 'is_array', + 'obj' => 'is_object', + 'object' => 'is_object', + 'res' => 'is_resource', + 'resource' => 'is_resource', + 'scalar' => 'is_scalar', #integer, float, string or boolean + 'cb' => 'is_callable', + 'callback' => 'is_callable', + );

    #calling the methods of this class only statically!
    private function __construct() {}

    /**
     * Validates the arguments of the calling method against the types written
     * in the "@param" tags of that method's phpDoc comment.
     *
     * Returns TRUE when assertions are switched off, when the caller received
     * no arguments, or when every checked argument matches its documented type.
     * Returns FALSE (after an E_USER_WARNING) on the first mismatch. Missing or
     * misnamed "@param" tags only raise an E_USER_NOTICE.
     *
     * Must be called directly from the method being checked: the caller is
     * located by its position in the stack trace.
     */
    public static function isValid()
    {
        if (! assert_options(ASSERT_ACTIVE)) return true;
        $bt = self::debugBacktrace(null, 1);
        extract($bt);  //to $file, $line, $function, $class, $object, $type, $args
        if (! $args) return true; #speed improve
        $r = new ReflectionMethod($class, $function);
        $doc = $r->getDocComment();
        preg_match_all('~ [\r\n]++ [\x20\t]++ \* [\x20\t]++
                          @param
                          [\x20\t]++
                          \K #memory reduce
                          ( [_a-z]++[_a-z\d]*+
                            (?>[|/,][_a-z]+[_a-z\d]*)*+
                          ) #1 types
                          [\x20\t]++
                          &?+\$([_a-z]++[_a-z\d]*+) #2 name
                        ~sixSX', $doc, $params, PREG_SET_ORDER);
        $parameters = $r->getParameters();
        if (count($parameters) > count($params))
        {
            $message = 'phpDoc %d piece(s) @param description expected in %s%s%s(), %s given, ' . PHP_EOL
                     . 'called in %s on line %d ' . PHP_EOL
                     . 'and defined in %s on line %d';
            $message = sprintf($message, count($parameters), $class, $type, $function, count($params), $file, $line, $r->getFileName(), $r->getStartLine());
            trigger_error($message, E_USER_NOTICE);
        }
        foreach ($args as $i => $value)
        {
            #stop when there is no phpDoc tag or no declared parameter for this argument
            #(the latter happens when a call passes more arguments than the method declares)
            if (! isset($params[$i]) || ! isset($parameters[$i])) return true;
            if ($parameters[$i]->name !== $params[$i][2])
            {
                $param_num = $i + 1;
                $message = 'phpDoc @param %d in %s%s%s() must be named as $%s, $%s given, ' . PHP_EOL
                         . 'called in %s on line %d ' . PHP_EOL
                         . 'and defined in %s on line %d';
                $message = sprintf($message, $param_num, $class, $type, $function, $parameters[$i]->name, $params[$i][2], $file, $line, $r->getFileName(), $r->getStartLine());
                trigger_error($message, E_USER_NOTICE);
            }

            $hints = preg_split('~[|/,]~sSX', $params[$i][1]);
            if (! self::checkValueTypes($hints, $value))
            {
                $param_num = $i + 1;
                $message = 'Argument %d passed to %s%s%s() must be an %s, %s given, ' . PHP_EOL
                         . 'called in %s on line %d ' . PHP_EOL
                         . 'and defined in %s on line %d';
                $message = sprintf($message, $param_num, $class, $type, $function, implode('|', $hints), (is_object($value) ? get_class($value) . ' ' : '') . gettype($value), $file, $line, $r->getFileName(), $r->getStartLine());
                trigger_error($message, E_USER_WARNING);
                return false;
            }
        }
        return true;
    }

    /**
     * Return stacktrace. Correctly work with call_user_func*()
     * (totally skip them correcting caller references).
     * If $return_frame is present, return only $return_frame matched caller, not all stacktrace.
     *
     * @param   string|null  $re_ignore     example: '~^' . preg_quote(__CLASS__, '~') . '(?![a-zA-Z\d])~sSX'
     * @param   int|null     $return_frame
     * @return  array
     */
    public static function debugBacktrace($re_ignore = null, $return_frame = null)
    {
        $trace = debug_backtrace();

        $a = array();
        $frames = 0;
        for ($i = 0, $n = count($trace); $i < $n; $i++)
        {
            $t = $trace[$i];
            if (! $t) continue;  #frame was nulled out below as a call_user_func*() frame

            // Next frame.
            $next = isset($trace[$i+1])? $trace[$i+1] : null;

            // Dummy frame before call_user_func*() frames.
            if (! isset($t['file']) && $next)
            {
                $t['over_function'] = $trace[$i+1]['function'];
                $t = $t + $trace[$i+1];  #array union: keys already present in $t win
                $trace[$i+1] = null; // skip call_user_func on next iteration
            }

            // Skip myself frame.
            if (++$frames < 2) continue;

            // 'class' and 'function' field of next frame define where this frame function situated.
            // Skip frames for functions situated in ignored places.
            if ($re_ignore && $next)
            {
                // Name of function "inside which" frame was generated.
                $frame_caller = (isset($next['class']) ? $next['class'] . $next['type'] : '')
                              . (isset($next['function']) ? $next['function'] : '');
                if (preg_match($re_ignore, $frame_caller)) continue;
            }

            // On each iteration we consider ability to add PREVIOUS frame to $a stack.
            if (count($a) === $return_frame) return $t;
            $a[] = $t;
        }
        return $a;
    }

    /**
     * Checks a value to the allowed types
     *
     * A type matches when it is a known hint whose checker accepts the value,
     * when the value is an object of that class, or when the type is "mixed".
     *
     * @param   array  $types
     * @param   mixed  $value
     * @return  bool
     */
    public static function checkValueTypes(array $types, $value)
    {
        foreach ($types as $type)
        {
            $type = strtolower($type);
            if (array_key_exists($type, self::$hints) && call_user_func(self::$hints[$type], $value)) return true;
            if (is_object($value) && @is_a($value, $type)) return true;
            if ($type === 'mixed') return true;
        }
        return false;
    }
}
\ No newline at end of file diff --git a/upload/includes/class.utf8.php b/upload/includes/class.utf8.php new file mode 100644 index 000000000..b2f1c5084 --- /dev/null +++ b/upload/includes/class.utf8.php @@ -0,0 +1,4072 @@ + = 5.3.x + * + * In Russian: + * + * Поддержка UTF-8 в PHP 5. + * + * Возможности и преимущества использования этого класса + * * Совместимость с интерфейсом стандартных PHP функций, работающих с однобайтовыми кодировками + * * Возможность работы без PHP расширений ICONV и MBSTRING, если они есть, то активно используются! 
+ * * Полезные функции, отсутствующие в ICONV и MBSTRING + * * Методы, которые принимают и возвращают строку, умеют принимать и возвращать null (удобно при выборках значений из базы данных) + * * Несколько методов умеют обрабатывать массивы рекурсивно + * * Единый интерфейс и инкапсуляция (можно унаследоваться и переопределить методы) + * * Высокая производительность, надёжность и качественный код + * * PHP >= 5.3.x + * + * Example: + * $s = 'Hello, Привет'; + * if (UTF8::is_utf8($s)) echo UTF8::strlen($s); + * + * UTF-8 encoding scheme: + * 2^7 0x00000000 — 0x0000007F 0xxxxxxx + * 2^11 0x00000080 — 0x000007FF 110xxxxx 10xxxxxx + * 2^16 0x00000800 — 0x0000FFFF 1110xxxx 10xxxxxx 10xxxxxx + * 2^21 0x00010000 — 0x001FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + * 1-4 bytes length: 2^7 + 2^11 + 2^16 + 2^21 = 2 164 864 + * + * If I was a owner of the world, I would leave only 2 encoding: UTF-8 and UTF-32 ;-) + * + * Useful links + * http://ru.wikipedia.org/wiki/UTF8 + * http://www.madore.org/~david/misc/unitest/ A Unicode Test Page + * http://www.unicode.org/ + * http://www.unicode.org/reports/ + * http://www.unicode.org/reports/tr10/ Unicode Collation Algorithm + * http://www.unicode.org/Public/UCA/6.0.0/ Unicode Collation Algorithm + * http://www.unicode.org/reports/tr6/ A Standard Compression Scheme for Unicode + * http://www.fileformat.info/info/unicode/char/search.htm Unicode Character Search + * + * @link http://code.google.com/p/php5-utf8/ + * @license http://creativecommons.org/licenses/by-sa/3.0/ + * @author Nasibullin Rinat + * @version 2.2.2 + */ +class UTF8 +{ + #REPLACEMENT CHARACTER (for broken char) + const REPLACEMENT_CHAR = "\xEF\xBF\xBD"; #U+FFFD + + /** + * Regular expression for a character in UTF-8 without the use of a flag /u + * @deprecated Instead, use a dot (".") and the flag /u, it works faster! 
+ * @var string + */ + public static $char_re = ' [\x09\x0A\x0D\x20-\x7E] # ASCII strict + # [\x00-\x7F] # ASCII non-strict (including control chars) + | [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte + | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs + | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte + | \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates + | \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3 + | [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15 + | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16 + '; + + /** + * Combining diactrical marks (Unicode 5.1). + * + * For example, russian letters in composed form: "Ё" (U+0401), "Й" (U+0419), + * decomposed form: (U+0415 U+0308), (U+0418 U+0306) + * + * @link http://www.unicode.org/charts/PDF/U0300.pdf + * @link http://www.unicode.org/charts/PDF/U1DC0.pdf + * @link http://www.unicode.org/charts/PDF/UFE20.pdf + * @var string + */ + #public static $diactrical_re = '\p{M}'; #alternative, but only with /u flag + public static $diactrical_re = ' \xcc[\x80-\xb9]|\xcd[\x80-\xaf] #UNICODE range: U+0300 — U+036F (for letters) + | \xe2\x83[\x90-\xbf] #UNICODE range: U+20D0 — U+20FF (for symbols) + | \xe1\xb7[\x80-\xbf] #UNICODE range: U+1DC0 — U+1DFF (supplement) + | \xef\xb8[\xa0-\xaf] #UNICODE range: U+FE20 — U+FE2F (combining half marks) + '; + + /** + * @var array + */ + public static $html_special_chars_table = array( + '"' => "\x22", #U+0022 ["] " quotation mark = APL quote + '&' => "\x26", #U+0026 [&] & ampersand + '<' => "\x3c", #U+003C [<] < less-than sign + '>' => "\x3e", #U+003E [>] > greater-than sign + ); + + /** + * @link http://www.fileformat.info/format/w3c/entitytest.htm?sort=Unicode%20Character HTML Entity Browser Test Page + * @var array + */ + public static $html_entity_table = array( + #Latin-1 Entities: + ' ' => "\xc2\xa0", #U+00A0 [ ] no-break space = non-breaking space + '¡' => "\xc2\xa1", #U+00A1 [¡] inverted exclamation mark + '¢' => "\xc2\xa2", #U+00A2 [¢] cent sign + '£' => "\xc2\xa3", #U+00A3 [£] pound sign 
+ '¤' => "\xc2\xa4", #U+00A4 [¤] currency sign + '¥' => "\xc2\xa5", #U+00A5 [¥] yen sign = yuan sign + '¦' => "\xc2\xa6", #U+00A6 [¦] broken bar = broken vertical bar + '§' => "\xc2\xa7", #U+00A7 [§] section sign + '¨' => "\xc2\xa8", #U+00A8 [¨] diaeresis = spacing diaeresis + '©' => "\xc2\xa9", #U+00A9 [©] copyright sign + 'ª' => "\xc2\xaa", #U+00AA [ª] feminine ordinal indicator + '«' => "\xc2\xab", #U+00AB [«] left-pointing double angle quotation mark = left pointing guillemet + '¬' => "\xc2\xac", #U+00AC [¬] not sign + '­' => "\xc2\xad", #U+00AD [ ] soft hyphen = discretionary hyphen + '®' => "\xc2\xae", #U+00AE [®] registered sign = registered trade mark sign + '¯' => "\xc2\xaf", #U+00AF [¯] macron = spacing macron = overline = APL overbar + '°' => "\xc2\xb0", #U+00B0 [°] degree sign + '±' => "\xc2\xb1", #U+00B1 [±] plus-minus sign = plus-or-minus sign + '²' => "\xc2\xb2", #U+00B2 [²] superscript two = superscript digit two = squared + '³' => "\xc2\xb3", #U+00B3 [³] superscript three = superscript digit three = cubed + '´' => "\xc2\xb4", #U+00B4 [´] acute accent = spacing acute + 'µ' => "\xc2\xb5", #U+00B5 [µ] micro sign + '¶' => "\xc2\xb6", #U+00B6 [¶] pilcrow sign = paragraph sign + '·' => "\xc2\xb7", #U+00B7 [·] middle dot = Georgian comma = Greek middle dot + '¸' => "\xc2\xb8", #U+00B8 [¸] cedilla = spacing cedilla + '¹' => "\xc2\xb9", #U+00B9 [¹] superscript one = superscript digit one + 'º' => "\xc2\xba", #U+00BA [º] masculine ordinal indicator + '»' => "\xc2\xbb", #U+00BB [»] right-pointing double angle quotation mark = right pointing guillemet + '¼' => "\xc2\xbc", #U+00BC [¼] vulgar fraction one quarter = fraction one quarter + '½' => "\xc2\xbd", #U+00BD [½] vulgar fraction one half = fraction one half + '¾' => "\xc2\xbe", #U+00BE [¾] vulgar fraction three quarters = fraction three quarters + '¿' => "\xc2\xbf", #U+00BF [¿] inverted question mark = turned question mark + #Latin capital letter + 'À' => "\xc3\x80", #Latin capital letter A with grave = 
Latin capital letter A grave + 'Á' => "\xc3\x81", #Latin capital letter A with acute + 'Â' => "\xc3\x82", #Latin capital letter A with circumflex + 'Ã' => "\xc3\x83", #Latin capital letter A with tilde + 'Ä' => "\xc3\x84", #Latin capital letter A with diaeresis + 'Å' => "\xc3\x85", #Latin capital letter A with ring above = Latin capital letter A ring + 'Æ' => "\xc3\x86", #Latin capital letter AE = Latin capital ligature AE + 'Ç' => "\xc3\x87", #Latin capital letter C with cedilla + 'È' => "\xc3\x88", #Latin capital letter E with grave + 'É' => "\xc3\x89", #Latin capital letter E with acute + 'Ê' => "\xc3\x8a", #Latin capital letter E with circumflex + 'Ë' => "\xc3\x8b", #Latin capital letter E with diaeresis + 'Ì' => "\xc3\x8c", #Latin capital letter I with grave + 'Í' => "\xc3\x8d", #Latin capital letter I with acute + 'Î' => "\xc3\x8e", #Latin capital letter I with circumflex + 'Ï' => "\xc3\x8f", #Latin capital letter I with diaeresis + 'Ð' => "\xc3\x90", #Latin capital letter ETH + 'Ñ' => "\xc3\x91", #Latin capital letter N with tilde + 'Ò' => "\xc3\x92", #Latin capital letter O with grave + 'Ó' => "\xc3\x93", #Latin capital letter O with acute + 'Ô' => "\xc3\x94", #Latin capital letter O with circumflex + 'Õ' => "\xc3\x95", #Latin capital letter O with tilde + 'Ö' => "\xc3\x96", #Latin capital letter O with diaeresis + '×' => "\xc3\x97", #U+00D7 [×] multiplication sign + 'Ø' => "\xc3\x98", #Latin capital letter O with stroke = Latin capital letter O slash + 'Ù' => "\xc3\x99", #Latin capital letter U with grave + 'Ú' => "\xc3\x9a", #Latin capital letter U with acute + 'Û' => "\xc3\x9b", #Latin capital letter U with circumflex + 'Ü' => "\xc3\x9c", #Latin capital letter U with diaeresis + 'Ý' => "\xc3\x9d", #Latin capital letter Y with acute + 'Þ' => "\xc3\x9e", #Latin capital letter THORN + #Latin small letter + 'ß' => "\xc3\x9f", #Latin small letter sharp s = ess-zed + 'à' => "\xc3\xa0", #Latin small letter a with grave = Latin small letter a grave + 'á' => 
"\xc3\xa1", #Latin small letter a with acute + 'â' => "\xc3\xa2", #Latin small letter a with circumflex + 'ã' => "\xc3\xa3", #Latin small letter a with tilde + 'ä' => "\xc3\xa4", #Latin small letter a with diaeresis + 'å' => "\xc3\xa5", #Latin small letter a with ring above = Latin small letter a ring + 'æ' => "\xc3\xa6", #Latin small letter ae = Latin small ligature ae + 'ç' => "\xc3\xa7", #Latin small letter c with cedilla + 'è' => "\xc3\xa8", #Latin small letter e with grave + 'é' => "\xc3\xa9", #Latin small letter e with acute + 'ê' => "\xc3\xaa", #Latin small letter e with circumflex + 'ë' => "\xc3\xab", #Latin small letter e with diaeresis + 'ì' => "\xc3\xac", #Latin small letter i with grave + 'í' => "\xc3\xad", #Latin small letter i with acute + 'î' => "\xc3\xae", #Latin small letter i with circumflex + 'ï' => "\xc3\xaf", #Latin small letter i with diaeresis + 'ð' => "\xc3\xb0", #Latin small letter eth + 'ñ' => "\xc3\xb1", #Latin small letter n with tilde + 'ò' => "\xc3\xb2", #Latin small letter o with grave + 'ó' => "\xc3\xb3", #Latin small letter o with acute + 'ô' => "\xc3\xb4", #Latin small letter o with circumflex + 'õ' => "\xc3\xb5", #Latin small letter o with tilde + 'ö' => "\xc3\xb6", #Latin small letter o with diaeresis + '÷' => "\xc3\xb7", #U+00F7 [÷] division sign + 'ø' => "\xc3\xb8", #Latin small letter o with stroke = Latin small letter o slash + 'ù' => "\xc3\xb9", #Latin small letter u with grave + 'ú' => "\xc3\xba", #Latin small letter u with acute + 'û' => "\xc3\xbb", #Latin small letter u with circumflex + 'ü' => "\xc3\xbc", #Latin small letter u with diaeresis + 'ý' => "\xc3\xbd", #Latin small letter y with acute + 'þ' => "\xc3\xbe", #Latin small letter thorn + 'ÿ' => "\xc3\xbf", #Latin small letter y with diaeresis + #Symbols and Greek Letters: + 'ƒ' => "\xc6\x92", #U+0192 [ƒ] Latin small f with hook = function = florin + 'Α' => "\xce\x91", #Greek capital letter alpha + 'Β' => "\xce\x92", #Greek capital letter beta + 'Γ' => "\xce\x93", 
#Greek capital letter gamma + 'Δ' => "\xce\x94", #Greek capital letter delta + 'Ε' => "\xce\x95", #Greek capital letter epsilon + 'Ζ' => "\xce\x96", #Greek capital letter zeta + 'Η' => "\xce\x97", #Greek capital letter eta + 'Θ' => "\xce\x98", #Greek capital letter theta + 'Ι' => "\xce\x99", #Greek capital letter iota + 'Κ' => "\xce\x9a", #Greek capital letter kappa + 'Λ' => "\xce\x9b", #Greek capital letter lambda + 'Μ' => "\xce\x9c", #Greek capital letter mu + 'Ν' => "\xce\x9d", #Greek capital letter nu + 'Ξ' => "\xce\x9e", #Greek capital letter xi + 'Ο' => "\xce\x9f", #Greek capital letter omicron + 'Π' => "\xce\xa0", #Greek capital letter pi + 'Ρ' => "\xce\xa1", #Greek capital letter rho + 'Σ' => "\xce\xa3", #Greek capital letter sigma + 'Τ' => "\xce\xa4", #Greek capital letter tau + 'Υ' => "\xce\xa5", #Greek capital letter upsilon + 'Φ' => "\xce\xa6", #Greek capital letter phi + 'Χ' => "\xce\xa7", #Greek capital letter chi + 'Ψ' => "\xce\xa8", #Greek capital letter psi + 'Ω' => "\xce\xa9", #Greek capital letter omega + 'α' => "\xce\xb1", #Greek small letter alpha + 'β' => "\xce\xb2", #Greek small letter beta + 'γ' => "\xce\xb3", #Greek small letter gamma + 'δ' => "\xce\xb4", #Greek small letter delta + 'ε' => "\xce\xb5", #Greek small letter epsilon + 'ζ' => "\xce\xb6", #Greek small letter zeta + 'η' => "\xce\xb7", #Greek small letter eta + 'θ' => "\xce\xb8", #Greek small letter theta + 'ι' => "\xce\xb9", #Greek small letter iota + 'κ' => "\xce\xba", #Greek small letter kappa + 'λ' => "\xce\xbb", #Greek small letter lambda + 'μ' => "\xce\xbc", #Greek small letter mu + 'ν' => "\xce\xbd", #Greek small letter nu + 'ξ' => "\xce\xbe", #Greek small letter xi + 'ο' => "\xce\xbf", #Greek small letter omicron + 'π' => "\xcf\x80", #Greek small letter pi + 'ρ' => "\xcf\x81", #Greek small letter rho + 'ς' => "\xcf\x82", #Greek small letter final sigma + 'σ' => "\xcf\x83", #Greek small letter sigma + 'τ' => "\xcf\x84", #Greek small letter tau + 'υ' => "\xcf\x85", #Greek 
small letter upsilon + 'φ' => "\xcf\x86", #Greek small letter phi + 'χ' => "\xcf\x87", #Greek small letter chi + 'ψ' => "\xcf\x88", #Greek small letter psi + 'ω' => "\xcf\x89", #Greek small letter omega + 'ϑ'=> "\xcf\x91", #Greek small letter theta symbol + 'ϒ' => "\xcf\x92", #Greek upsilon with hook symbol + 'ϖ' => "\xcf\x96", #U+03D6 [ϖ] Greek pi symbol + + '•' => "\xe2\x80\xa2", #U+2022 [•] bullet = black small circle + '…' => "\xe2\x80\xa6", #U+2026 […] horizontal ellipsis = three dot leader + '′' => "\xe2\x80\xb2", #U+2032 [′] prime = minutes = feet (для обозначения минут и футов) + '″' => "\xe2\x80\xb3", #U+2033 [″] double prime = seconds = inches (для обозначения секунд и дюймов). + '‾' => "\xe2\x80\xbe", #U+203E [‾] overline = spacing overscore + '⁄' => "\xe2\x81\x84", #U+2044 [⁄] fraction slash + '℘' => "\xe2\x84\x98", #U+2118 [℘] script capital P = power set = Weierstrass p + 'ℑ' => "\xe2\x84\x91", #U+2111 [ℑ] blackletter capital I = imaginary part + 'ℜ' => "\xe2\x84\x9c", #U+211C [ℜ] blackletter capital R = real part symbol + '™' => "\xe2\x84\xa2", #U+2122 [™] trade mark sign + 'ℵ' => "\xe2\x84\xb5", #U+2135 [ℵ] alef symbol = first transfinite cardinal + '←' => "\xe2\x86\x90", #U+2190 [←] leftwards arrow + '↑' => "\xe2\x86\x91", #U+2191 [↑] upwards arrow + '→' => "\xe2\x86\x92", #U+2192 [→] rightwards arrow + '↓' => "\xe2\x86\x93", #U+2193 [↓] downwards arrow + '↔' => "\xe2\x86\x94", #U+2194 [↔] left right arrow + '↵' => "\xe2\x86\xb5", #U+21B5 [↵] downwards arrow with corner leftwards = carriage return + '⇐' => "\xe2\x87\x90", #U+21D0 [⇐] leftwards double arrow + '⇑' => "\xe2\x87\x91", #U+21D1 [⇑] upwards double arrow + '⇒' => "\xe2\x87\x92", #U+21D2 [⇒] rightwards double arrow + '⇓' => "\xe2\x87\x93", #U+21D3 [⇓] downwards double arrow + '⇔' => "\xe2\x87\x94", #U+21D4 [⇔] left right double arrow + '∀' => "\xe2\x88\x80", #U+2200 [∀] for all + '∂' => "\xe2\x88\x82", #U+2202 [∂] partial differential + '∃' => "\xe2\x88\x83", #U+2203 [∃] there exists + '∅' 
=> "\xe2\x88\x85", #U+2205 [∅] empty set = null set = diameter + '∇' => "\xe2\x88\x87", #U+2207 [∇] nabla = backward difference + '∈' => "\xe2\x88\x88", #U+2208 [∈] element of + '∉' => "\xe2\x88\x89", #U+2209 [∉] not an element of + '∋' => "\xe2\x88\x8b", #U+220B [∋] contains as member + '∏' => "\xe2\x88\x8f", #U+220F [∏] n-ary product = product sign + '∑' => "\xe2\x88\x91", #U+2211 [∑] n-ary sumation + '−' => "\xe2\x88\x92", #U+2212 [−] minus sign + '∗' => "\xe2\x88\x97", #U+2217 [∗] asterisk operator + '√' => "\xe2\x88\x9a", #U+221A [√] square root = radical sign + '∝' => "\xe2\x88\x9d", #U+221D [∝] proportional to + '∞' => "\xe2\x88\x9e", #U+221E [∞] infinity + '∠' => "\xe2\x88\xa0", #U+2220 [∠] angle + '∧' => "\xe2\x88\xa7", #U+2227 [∧] logical and = wedge + '∨' => "\xe2\x88\xa8", #U+2228 [∨] logical or = vee + '∩' => "\xe2\x88\xa9", #U+2229 [∩] intersection = cap + '∪' => "\xe2\x88\xaa", #U+222A [∪] union = cup + '∫' => "\xe2\x88\xab", #U+222B [∫] integral + '∴' => "\xe2\x88\xb4", #U+2234 [∴] therefore + '∼' => "\xe2\x88\xbc", #U+223C [∼] tilde operator = varies with = similar to + '≅' => "\xe2\x89\x85", #U+2245 [≅] approximately equal to + '≈' => "\xe2\x89\x88", #U+2248 [≈] almost equal to = asymptotic to + '≠' => "\xe2\x89\xa0", #U+2260 [≠] not equal to + '≡' => "\xe2\x89\xa1", #U+2261 [≡] identical to + '≤' => "\xe2\x89\xa4", #U+2264 [≤] less-than or equal to + '≥' => "\xe2\x89\xa5", #U+2265 [≥] greater-than or equal to + '⊂' => "\xe2\x8a\x82", #U+2282 [⊂] subset of + '⊃' => "\xe2\x8a\x83", #U+2283 [⊃] superset of + '⊄' => "\xe2\x8a\x84", #U+2284 [⊄] not a subset of + '⊆' => "\xe2\x8a\x86", #U+2286 [⊆] subset of or equal to + '⊇' => "\xe2\x8a\x87", #U+2287 [⊇] superset of or equal to + '⊕' => "\xe2\x8a\x95", #U+2295 [⊕] circled plus = direct sum + '⊗' => "\xe2\x8a\x97", #U+2297 [⊗] circled times = vector product + '⊥' => "\xe2\x8a\xa5", #U+22A5 [⊥] up tack = orthogonal to = perpendicular + '⋅' => "\xe2\x8b\x85", #U+22C5 [⋅] dot operator + '⌈' => 
"\xe2\x8c\x88", #U+2308 [⌈] left ceiling = APL upstile + '⌉' => "\xe2\x8c\x89", #U+2309 [⌉] right ceiling + '⌊' => "\xe2\x8c\x8a", #U+230A [⌊] left floor = APL downstile + '⌋' => "\xe2\x8c\x8b", #U+230B [⌋] right floor + '⟨' => "\xe2\x8c\xa9", #U+2329 [〈] left-pointing angle bracket = bra + '⟩' => "\xe2\x8c\xaa", #U+232A [〉] right-pointing angle bracket = ket + '◊' => "\xe2\x97\x8a", #U+25CA [◊] lozenge + '♠' => "\xe2\x99\xa0", #U+2660 [♠] black spade suit + '♣' => "\xe2\x99\xa3", #U+2663 [♣] black club suit = shamrock + '♥' => "\xe2\x99\xa5", #U+2665 [♥] black heart suit = valentine + '♦' => "\xe2\x99\xa6", #U+2666 [♦] black diamond suit + #Other Special Characters: + 'Œ' => "\xc5\x92", #U+0152 [Œ] Latin capital ligature OE + 'œ' => "\xc5\x93", #U+0153 [œ] Latin small ligature oe + 'Š' => "\xc5\xa0", #U+0160 [Š] Latin capital letter S with caron + 'š' => "\xc5\xa1", #U+0161 [š] Latin small letter s with caron + 'Ÿ' => "\xc5\xb8", #U+0178 [Ÿ] Latin capital letter Y with diaeresis + 'ˆ' => "\xcb\x86", #U+02C6 [ˆ] modifier letter circumflex accent + '˜' => "\xcb\x9c", #U+02DC [˜] small tilde + ' ' => "\xe2\x80\x82", #U+2002 [ ] en space + ' ' => "\xe2\x80\x83", #U+2003 [ ] em space + ' ' => "\xe2\x80\x89", #U+2009 [ ] thin space + '‌' => "\xe2\x80\x8c", #U+200C [‌] zero width non-joiner + '‍' => "\xe2\x80\x8d", #U+200D [‍] zero width joiner + '‎' => "\xe2\x80\x8e", #U+200E [‎] left-to-right mark + '‏' => "\xe2\x80\x8f", #U+200F [‏] right-to-left mark + '–' => "\xe2\x80\x93", #U+2013 [–] en dash + '—' => "\xe2\x80\x94", #U+2014 [—] em dash + '‘' => "\xe2\x80\x98", #U+2018 [‘] left single quotation mark + '’' => "\xe2\x80\x99", #U+2019 [’] right single quotation mark (and apostrophe!) 
+ '‚' => "\xe2\x80\x9a", #U+201A [‚] single low-9 quotation mark + '“' => "\xe2\x80\x9c", #U+201C [“] left double quotation mark + '”' => "\xe2\x80\x9d", #U+201D [”] right double quotation mark + '„' => "\xe2\x80\x9e", #U+201E [„] double low-9 quotation mark + '†' => "\xe2\x80\xa0", #U+2020 [†] dagger + '‡' => "\xe2\x80\xa1", #U+2021 [‡] double dagger + '‰' => "\xe2\x80\xb0", #U+2030 [‰] per mille sign + '‹' => "\xe2\x80\xb9", #U+2039 [‹] single left-pointing angle quotation mark + '›' => "\xe2\x80\xba", #U+203A [›] single right-pointing angle quotation mark + '€' => "\xe2\x82\xac", #U+20AC [€] euro sign + ); + + /** + * This table contains the data on how cp1259 characters map into Unicode (UTF-8). + * The cp1259 map describes standart tatarish cyrillic charset and based on the cp1251 table. + * cp1259 -- this is an outdated one byte encoding of the Tatar language, + * which includes all the Russian letters from cp1251. + * + * @link http://search.cpan.org/CPAN/authors/id/A/AM/AMICHAUER/Lingua-TT-Yanalif-0.08.tar.gz + * @link http://www.unicode.org/charts/PDF/U0400.pdf + */ + public static $cp1259_table = array( + #bytes from 0x00 to 0x7F (ASCII) saved as is + "\x80" => "\xd3\x98", #U+04d8 CYRILLIC CAPITAL LETTER SCHWA + "\x81" => "\xd0\x83", #U+0403 CYRILLIC CAPITAL LETTER GJE + "\x82" => "\xe2\x80\x9a", #U+201a SINGLE LOW-9 QUOTATION MARK + "\x83" => "\xd1\x93", #U+0453 CYRILLIC SMALL LETTER GJE + "\x84" => "\xe2\x80\x9e", #U+201e DOUBLE LOW-9 QUOTATION MARK + "\x85" => "\xe2\x80\xa6", #U+2026 HORIZONTAL ELLIPSIS + "\x86" => "\xe2\x80\xa0", #U+2020 DAGGER + "\x87" => "\xe2\x80\xa1", #U+2021 DOUBLE DAGGER + "\x88" => "\xe2\x82\xac", #U+20ac EURO SIGN + "\x89" => "\xe2\x80\xb0", #U+2030 PER MILLE SIGN + "\x8a" => "\xd3\xa8", #U+04e8 CYRILLIC CAPITAL LETTER BARRED O + "\x8b" => "\xe2\x80\xb9", #U+2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK + "\x8c" => "\xd2\xae", #U+04ae CYRILLIC CAPITAL LETTER STRAIGHT U + "\x8d" => "\xd2\x96", #U+0496 CYRILLIC CAPITAL LETTER 
ZHE WITH DESCENDER + "\x8e" => "\xd2\xa2", #U+04a2 CYRILLIC CAPITAL LETTER EN WITH HOOK + "\x8f" => "\xd2\xba", #U+04ba CYRILLIC CAPITAL LETTER SHHA + "\x90" => "\xd3\x99", #U+04d9 CYRILLIC SMALL LETTER SCHWA + "\x91" => "\xe2\x80\x98", #U+2018 LEFT SINGLE QUOTATION MARK + "\x92" => "\xe2\x80\x99", #U+2019 RIGHT SINGLE QUOTATION MARK + "\x93" => "\xe2\x80\x9c", #U+201c LEFT DOUBLE QUOTATION MARK + "\x94" => "\xe2\x80\x9d", #U+201d RIGHT DOUBLE QUOTATION MARK + "\x95" => "\xe2\x80\xa2", #U+2022 BULLET + "\x96" => "\xe2\x80\x93", #U+2013 EN DASH + "\x97" => "\xe2\x80\x94", #U+2014 EM DASH + #"\x98" #UNDEFINED + "\x99" => "\xe2\x84\xa2", #U+2122 TRADE MARK SIGN + "\x9a" => "\xd3\xa9", #U+04e9 CYRILLIC SMALL LETTER BARRED O + "\x9b" => "\xe2\x80\xba", #U+203a SINGLE RIGHT-POINTING ANGLE QUOTATION MARK + "\x9c" => "\xd2\xaf", #U+04af CYRILLIC SMALL LETTER STRAIGHT U + "\x9d" => "\xd2\x97", #U+0497 CYRILLIC SMALL LETTER ZHE WITH DESCENDER + "\x9e" => "\xd2\xa3", #U+04a3 CYRILLIC SMALL LETTER EN WITH HOOK + "\x9f" => "\xd2\xbb", #U+04bb CYRILLIC SMALL LETTER SHHA + "\xa0" => "\xc2\xa0", #U+00a0 NO-BREAK SPACE + "\xa1" => "\xd0\x8e", #U+040e CYRILLIC CAPITAL LETTER SHORT U + "\xa2" => "\xd1\x9e", #U+045e CYRILLIC SMALL LETTER SHORT U + "\xa3" => "\xd0\x88", #U+0408 CYRILLIC CAPITAL LETTER JE + "\xa4" => "\xc2\xa4", #U+00a4 CURRENCY SIGN + "\xa5" => "\xd2\x90", #U+0490 CYRILLIC CAPITAL LETTER GHE WITH UPTURN + "\xa6" => "\xc2\xa6", #U+00a6 BROKEN BAR + "\xa7" => "\xc2\xa7", #U+00a7 SECTION SIGN + "\xa8" => "\xd0\x81", #U+0401 CYRILLIC CAPITAL LETTER IO + "\xa9" => "\xc2\xa9", #U+00a9 COPYRIGHT SIGN + "\xaa" => "\xd0\x84", #U+0404 CYRILLIC CAPITAL LETTER UKRAINIAN IE + "\xab" => "\xc2\xab", #U+00ab LEFT-POINTING DOUBLE ANGLE QUOTATION MARK + "\xac" => "\xc2\xac", #U+00ac NOT SIGN + "\xad" => "\xc2\xad", #U+00ad SOFT HYPHEN + "\xae" => "\xc2\xae", #U+00ae REGISTERED SIGN + "\xaf" => "\xd0\x87", #U+0407 CYRILLIC CAPITAL LETTER YI + "\xb0" => "\xc2\xb0", #U+00b0 DEGREE SIGN + 
"\xb1" => "\xc2\xb1", #U+00b1 PLUS-MINUS SIGN + "\xb2" => "\xd0\x86", #U+0406 CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I + "\xb3" => "\xd1\x96", #U+0456 CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I + "\xb4" => "\xd2\x91", #U+0491 CYRILLIC SMALL LETTER GHE WITH UPTURN + "\xb5" => "\xc2\xb5", #U+00b5 MICRO SIGN + "\xb6" => "\xc2\xb6", #U+00b6 PILCROW SIGN + "\xb7" => "\xc2\xb7", #U+00b7 MIDDLE DOT + "\xb8" => "\xd1\x91", #U+0451 CYRILLIC SMALL LETTER IO + "\xb9" => "\xe2\x84\x96", #U+2116 NUMERO SIGN + "\xba" => "\xd1\x94", #U+0454 CYRILLIC SMALL LETTER UKRAINIAN IE + "\xbb" => "\xc2\xbb", #U+00bb RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK + "\xbc" => "\xd1\x98", #U+0458 CYRILLIC SMALL LETTER JE + "\xbd" => "\xd0\x85", #U+0405 CYRILLIC CAPITAL LETTER DZE + "\xbe" => "\xd1\x95", #U+0455 CYRILLIC SMALL LETTER DZE + "\xbf" => "\xd1\x97", #U+0457 CYRILLIC SMALL LETTER YI + "\xc0" => "\xd0\x90", #U+0410 CYRILLIC CAPITAL LETTER A + "\xc1" => "\xd0\x91", #U+0411 CYRILLIC CAPITAL LETTER BE + "\xc2" => "\xd0\x92", #U+0412 CYRILLIC CAPITAL LETTER VE + "\xc3" => "\xd0\x93", #U+0413 CYRILLIC CAPITAL LETTER GHE + "\xc4" => "\xd0\x94", #U+0414 CYRILLIC CAPITAL LETTER DE + "\xc5" => "\xd0\x95", #U+0415 CYRILLIC CAPITAL LETTER IE + "\xc6" => "\xd0\x96", #U+0416 CYRILLIC CAPITAL LETTER ZHE + "\xc7" => "\xd0\x97", #U+0417 CYRILLIC CAPITAL LETTER ZE + "\xc8" => "\xd0\x98", #U+0418 CYRILLIC CAPITAL LETTER I + "\xc9" => "\xd0\x99", #U+0419 CYRILLIC CAPITAL LETTER SHORT I + "\xca" => "\xd0\x9a", #U+041a CYRILLIC CAPITAL LETTER KA + "\xcb" => "\xd0\x9b", #U+041b CYRILLIC CAPITAL LETTER EL + "\xcc" => "\xd0\x9c", #U+041c CYRILLIC CAPITAL LETTER EM + "\xcd" => "\xd0\x9d", #U+041d CYRILLIC CAPITAL LETTER EN + "\xce" => "\xd0\x9e", #U+041e CYRILLIC CAPITAL LETTER O + "\xcf" => "\xd0\x9f", #U+041f CYRILLIC CAPITAL LETTER PE + "\xd0" => "\xd0\xa0", #U+0420 CYRILLIC CAPITAL LETTER ER + "\xd1" => "\xd0\xa1", #U+0421 CYRILLIC CAPITAL LETTER ES + "\xd2" => "\xd0\xa2", #U+0422 CYRILLIC 
CAPITAL LETTER TE + "\xd3" => "\xd0\xa3", #U+0423 CYRILLIC CAPITAL LETTER U + "\xd4" => "\xd0\xa4", #U+0424 CYRILLIC CAPITAL LETTER EF + "\xd5" => "\xd0\xa5", #U+0425 CYRILLIC CAPITAL LETTER HA + "\xd6" => "\xd0\xa6", #U+0426 CYRILLIC CAPITAL LETTER TSE + "\xd7" => "\xd0\xa7", #U+0427 CYRILLIC CAPITAL LETTER CHE + "\xd8" => "\xd0\xa8", #U+0428 CYRILLIC CAPITAL LETTER SHA + "\xd9" => "\xd0\xa9", #U+0429 CYRILLIC CAPITAL LETTER SHCHA + "\xda" => "\xd0\xaa", #U+042a CYRILLIC CAPITAL LETTER HARD SIGN + "\xdb" => "\xd0\xab", #U+042b CYRILLIC CAPITAL LETTER YERU + "\xdc" => "\xd0\xac", #U+042c CYRILLIC CAPITAL LETTER SOFT SIGN + "\xdd" => "\xd0\xad", #U+042d CYRILLIC CAPITAL LETTER E + "\xde" => "\xd0\xae", #U+042e CYRILLIC CAPITAL LETTER YU + "\xdf" => "\xd0\xaf", #U+042f CYRILLIC CAPITAL LETTER YA + "\xe0" => "\xd0\xb0", #U+0430 CYRILLIC SMALL LETTER A + "\xe1" => "\xd0\xb1", #U+0431 CYRILLIC SMALL LETTER BE + "\xe2" => "\xd0\xb2", #U+0432 CYRILLIC SMALL LETTER VE + "\xe3" => "\xd0\xb3", #U+0433 CYRILLIC SMALL LETTER GHE + "\xe4" => "\xd0\xb4", #U+0434 CYRILLIC SMALL LETTER DE + "\xe5" => "\xd0\xb5", #U+0435 CYRILLIC SMALL LETTER IE + "\xe6" => "\xd0\xb6", #U+0436 CYRILLIC SMALL LETTER ZHE + "\xe7" => "\xd0\xb7", #U+0437 CYRILLIC SMALL LETTER ZE + "\xe8" => "\xd0\xb8", #U+0438 CYRILLIC SMALL LETTER I + "\xe9" => "\xd0\xb9", #U+0439 CYRILLIC SMALL LETTER SHORT I + "\xea" => "\xd0\xba", #U+043a CYRILLIC SMALL LETTER KA + "\xeb" => "\xd0\xbb", #U+043b CYRILLIC SMALL LETTER EL + "\xec" => "\xd0\xbc", #U+043c CYRILLIC SMALL LETTER EM + "\xed" => "\xd0\xbd", #U+043d CYRILLIC SMALL LETTER EN + "\xee" => "\xd0\xbe", #U+043e CYRILLIC SMALL LETTER O + "\xef" => "\xd0\xbf", #U+043f CYRILLIC SMALL LETTER PE + "\xf0" => "\xd1\x80", #U+0440 CYRILLIC SMALL LETTER ER + "\xf1" => "\xd1\x81", #U+0441 CYRILLIC SMALL LETTER ES + "\xf2" => "\xd1\x82", #U+0442 CYRILLIC SMALL LETTER TE + "\xf3" => "\xd1\x83", #U+0443 CYRILLIC SMALL LETTER U + "\xf4" => "\xd1\x84", #U+0444 CYRILLIC SMALL 
LETTER EF + "\xf5" => "\xd1\x85", #U+0445 CYRILLIC SMALL LETTER HA + "\xf6" => "\xd1\x86", #U+0446 CYRILLIC SMALL LETTER TSE + "\xf7" => "\xd1\x87", #U+0447 CYRILLIC SMALL LETTER CHE + "\xf8" => "\xd1\x88", #U+0448 CYRILLIC SMALL LETTER SHA + "\xf9" => "\xd1\x89", #U+0449 CYRILLIC SMALL LETTER SHCHA + "\xfa" => "\xd1\x8a", #U+044a CYRILLIC SMALL LETTER HARD SIGN + "\xfb" => "\xd1\x8b", #U+044b CYRILLIC SMALL LETTER YERU + "\xfc" => "\xd1\x8c", #U+044c CYRILLIC SMALL LETTER SOFT SIGN + "\xfd" => "\xd1\x8d", #U+044d CYRILLIC SMALL LETTER E + "\xfe" => "\xd1\x8e", #U+044e CYRILLIC SMALL LETTER YU + "\xff" => "\xd1\x8f", #U+044f CYRILLIC SMALL LETTER YA + ); + + /** + * UTF-8 Case lookup table + * + * This lookuptable defines the upper case letters to their correspponding + * lower case letter in UTF-8 + * + * @author Andreas Gohr + */ + public static $convert_case_table = array( + #CASE_UPPER => case_lower + "\x41" => "\x61", #A a + "\x42" => "\x62", #B b + "\x43" => "\x63", #C c + "\x44" => "\x64", #D d + "\x45" => "\x65", #E e + "\x46" => "\x66", #F f + "\x47" => "\x67", #G g + "\x48" => "\x68", #H h + "\x49" => "\x69", #I i + "\x4a" => "\x6a", #J j + "\x4b" => "\x6b", #K k + "\x4c" => "\x6c", #L l + "\x4d" => "\x6d", #M m + "\x4e" => "\x6e", #N n + "\x4f" => "\x6f", #O o + "\x50" => "\x70", #P p + "\x51" => "\x71", #Q q + "\x52" => "\x72", #R r + "\x53" => "\x73", #S s + "\x54" => "\x74", #T t + "\x55" => "\x75", #U u + "\x56" => "\x76", #V v + "\x57" => "\x77", #W w + "\x58" => "\x78", #X x + "\x59" => "\x79", #Y y + "\x5a" => "\x7a", #Z z + "\xc3\x80" => "\xc3\xa0", + "\xc3\x81" => "\xc3\xa1", + "\xc3\x82" => "\xc3\xa2", + "\xc3\x83" => "\xc3\xa3", + "\xc3\x84" => "\xc3\xa4", + "\xc3\x85" => "\xc3\xa5", + "\xc3\x86" => "\xc3\xa6", + "\xc3\x87" => "\xc3\xa7", + "\xc3\x88" => "\xc3\xa8", + "\xc3\x89" => "\xc3\xa9", + "\xc3\x8a" => "\xc3\xaa", + "\xc3\x8b" => "\xc3\xab", + "\xc3\x8c" => "\xc3\xac", + "\xc3\x8d" => "\xc3\xad", + "\xc3\x8e" => "\xc3\xae", + "\xc3\x8f" 
=> "\xc3\xaf", + "\xc3\x90" => "\xc3\xb0", + "\xc3\x91" => "\xc3\xb1", + "\xc3\x92" => "\xc3\xb2", + "\xc3\x93" => "\xc3\xb3", + "\xc3\x94" => "\xc3\xb4", + "\xc3\x95" => "\xc3\xb5", + "\xc3\x96" => "\xc3\xb6", + "\xc3\x98" => "\xc3\xb8", + "\xc3\x99" => "\xc3\xb9", + "\xc3\x9a" => "\xc3\xba", + "\xc3\x9b" => "\xc3\xbb", + "\xc3\x9c" => "\xc3\xbc", + "\xc3\x9d" => "\xc3\xbd", + "\xc3\x9e" => "\xc3\xbe", + "\xc4\x80" => "\xc4\x81", + "\xc4\x82" => "\xc4\x83", + "\xc4\x84" => "\xc4\x85", + "\xc4\x86" => "\xc4\x87", + "\xc4\x88" => "\xc4\x89", + "\xc4\x8a" => "\xc4\x8b", + "\xc4\x8c" => "\xc4\x8d", + "\xc4\x8e" => "\xc4\x8f", + "\xc4\x90" => "\xc4\x91", + "\xc4\x92" => "\xc4\x93", + "\xc4\x94" => "\xc4\x95", + "\xc4\x96" => "\xc4\x97", + "\xc4\x98" => "\xc4\x99", + "\xc4\x9a" => "\xc4\x9b", + "\xc4\x9c" => "\xc4\x9d", + "\xc4\x9e" => "\xc4\x9f", + "\xc4\xa0" => "\xc4\xa1", + "\xc4\xa2" => "\xc4\xa3", + "\xc4\xa4" => "\xc4\xa5", + "\xc4\xa6" => "\xc4\xa7", + "\xc4\xa8" => "\xc4\xa9", + "\xc4\xaa" => "\xc4\xab", + "\xc4\xac" => "\xc4\xad", + "\xc4\xae" => "\xc4\xaf", + "\xc4\xb2" => "\xc4\xb3", + "\xc4\xb4" => "\xc4\xb5", + "\xc4\xb6" => "\xc4\xb7", + "\xc4\xb9" => "\xc4\xba", + "\xc4\xbb" => "\xc4\xbc", + "\xc4\xbd" => "\xc4\xbe", + "\xc4\xbf" => "\xc5\x80", + "\xc5\x81" => "\xc5\x82", + "\xc5\x83" => "\xc5\x84", + "\xc5\x85" => "\xc5\x86", + "\xc5\x87" => "\xc5\x88", + "\xc5\x8a" => "\xc5\x8b", + "\xc5\x8c" => "\xc5\x8d", + "\xc5\x8e" => "\xc5\x8f", + "\xc5\x90" => "\xc5\x91", + "\xc5\x92" => "\xc5\x93", + "\xc5\x94" => "\xc5\x95", + "\xc5\x96" => "\xc5\x97", + "\xc5\x98" => "\xc5\x99", + "\xc5\x9a" => "\xc5\x9b", + "\xc5\x9c" => "\xc5\x9d", + "\xc5\x9e" => "\xc5\x9f", + "\xc5\xa0" => "\xc5\xa1", + "\xc5\xa2" => "\xc5\xa3", + "\xc5\xa4" => "\xc5\xa5", + "\xc5\xa6" => "\xc5\xa7", + "\xc5\xa8" => "\xc5\xa9", + "\xc5\xaa" => "\xc5\xab", + "\xc5\xac" => "\xc5\xad", + "\xc5\xae" => "\xc5\xaf", + "\xc5\xb0" => "\xc5\xb1", + "\xc5\xb2" => "\xc5\xb3", + "\xc5\xb4" => 
"\xc5\xb5", + "\xc5\xb6" => "\xc5\xb7", + "\xc5\xb8" => "\xc3\xbf", + "\xc5\xb9" => "\xc5\xba", + "\xc5\xbb" => "\xc5\xbc", + "\xc5\xbd" => "\xc5\xbe", + "\xc6\x81" => "\xc9\x93", + "\xc6\x82" => "\xc6\x83", + "\xc6\x84" => "\xc6\x85", + "\xc6\x86" => "\xc9\x94", + "\xc6\x87" => "\xc6\x88", + "\xc6\x89" => "\xc9\x96", + "\xc6\x8a" => "\xc9\x97", + "\xc6\x8b" => "\xc6\x8c", + "\xc6\x8e" => "\xc7\x9d", + "\xc6\x8f" => "\xc9\x99", + "\xc6\x90" => "\xc9\x9b", + "\xc6\x91" => "\xc6\x92", + "\xc6\x94" => "\xc9\xa3", + "\xc6\x96" => "\xc9\xa9", + "\xc6\x97" => "\xc9\xa8", + "\xc6\x98" => "\xc6\x99", + "\xc6\x9c" => "\xc9\xaf", + "\xc6\x9d" => "\xc9\xb2", + "\xc6\x9f" => "\xc9\xb5", + "\xc6\xa0" => "\xc6\xa1", + "\xc6\xa2" => "\xc6\xa3", + "\xc6\xa4" => "\xc6\xa5", + "\xc6\xa6" => "\xca\x80", + "\xc6\xa7" => "\xc6\xa8", + "\xc6\xa9" => "\xca\x83", + "\xc6\xac" => "\xc6\xad", + "\xc6\xae" => "\xca\x88", + "\xc6\xaf" => "\xc6\xb0", + "\xc6\xb1" => "\xca\x8a", + "\xc6\xb2" => "\xca\x8b", + "\xc6\xb3" => "\xc6\xb4", + "\xc6\xb5" => "\xc6\xb6", + "\xc6\xb7" => "\xca\x92", + "\xc6\xb8" => "\xc6\xb9", + "\xc6\xbc" => "\xc6\xbd", + "\xc7\x85" => "\xc7\x86", + "\xc7\x88" => "\xc7\x89", + "\xc7\x8b" => "\xc7\x8c", + "\xc7\x8d" => "\xc7\x8e", + "\xc7\x8f" => "\xc7\x90", + "\xc7\x91" => "\xc7\x92", + "\xc7\x93" => "\xc7\x94", + "\xc7\x95" => "\xc7\x96", + "\xc7\x97" => "\xc7\x98", + "\xc7\x99" => "\xc7\x9a", + "\xc7\x9b" => "\xc7\x9c", + "\xc7\x9e" => "\xc7\x9f", + "\xc7\xa0" => "\xc7\xa1", + "\xc7\xa2" => "\xc7\xa3", + "\xc7\xa4" => "\xc7\xa5", + "\xc7\xa6" => "\xc7\xa7", + "\xc7\xa8" => "\xc7\xa9", + "\xc7\xaa" => "\xc7\xab", + "\xc7\xac" => "\xc7\xad", + "\xc7\xae" => "\xc7\xaf", + "\xc7\xb2" => "\xc7\xb3", + "\xc7\xb4" => "\xc7\xb5", + "\xc7\xb6" => "\xc6\x95", + "\xc7\xb7" => "\xc6\xbf", + "\xc7\xb8" => "\xc7\xb9", + "\xc7\xba" => "\xc7\xbb", + "\xc7\xbc" => "\xc7\xbd", + "\xc7\xbe" => "\xc7\xbf", + "\xc8\x80" => "\xc8\x81", + "\xc8\x82" => "\xc8\x83", + "\xc8\x84" => "\xc8\x85", 
+ "\xc8\x86" => "\xc8\x87", + "\xc8\x88" => "\xc8\x89", + "\xc8\x8a" => "\xc8\x8b", + "\xc8\x8c" => "\xc8\x8d", + "\xc8\x8e" => "\xc8\x8f", + "\xc8\x90" => "\xc8\x91", + "\xc8\x92" => "\xc8\x93", + "\xc8\x94" => "\xc8\x95", + "\xc8\x96" => "\xc8\x97", + "\xc8\x98" => "\xc8\x99", + "\xc8\x9a" => "\xc8\x9b", + "\xc8\x9c" => "\xc8\x9d", + "\xc8\x9e" => "\xc8\x9f", + "\xc8\xa0" => "\xc6\x9e", + "\xc8\xa2" => "\xc8\xa3", + "\xc8\xa4" => "\xc8\xa5", + "\xc8\xa6" => "\xc8\xa7", + "\xc8\xa8" => "\xc8\xa9", + "\xc8\xaa" => "\xc8\xab", + "\xc8\xac" => "\xc8\xad", + "\xc8\xae" => "\xc8\xaf", + "\xc8\xb0" => "\xc8\xb1", + "\xc8\xb2" => "\xc8\xb3", + "\xce\x86" => "\xce\xac", + "\xce\x88" => "\xce\xad", + "\xce\x89" => "\xce\xae", + "\xce\x8a" => "\xce\xaf", + "\xce\x8c" => "\xcf\x8c", + "\xce\x8e" => "\xcf\x8d", + "\xce\x8f" => "\xcf\x8e", + "\xce\x91" => "\xce\xb1", + "\xce\x92" => "\xce\xb2", + "\xce\x93" => "\xce\xb3", + "\xce\x94" => "\xce\xb4", + "\xce\x95" => "\xce\xb5", + "\xce\x96" => "\xce\xb6", + "\xce\x97" => "\xce\xb7", + "\xce\x98" => "\xce\xb8", + "\xce\x99" => "\xce\xb9", + "\xce\x9a" => "\xce\xba", + "\xce\x9b" => "\xce\xbb", + "\xce\x9c" => "\xc2\xb5", + "\xce\x9d" => "\xce\xbd", + "\xce\x9e" => "\xce\xbe", + "\xce\x9f" => "\xce\xbf", + "\xce\xa0" => "\xcf\x80", + "\xce\xa1" => "\xcf\x81", + "\xce\xa3" => "\xcf\x82", + "\xce\xa4" => "\xcf\x84", + "\xce\xa5" => "\xcf\x85", + "\xce\xa6" => "\xcf\x86", + "\xce\xa7" => "\xcf\x87", + "\xce\xa8" => "\xcf\x88", + "\xce\xa9" => "\xcf\x89", + "\xce\xaa" => "\xcf\x8a", + "\xce\xab" => "\xcf\x8b", + "\xcf\x98" => "\xcf\x99", + "\xcf\x9a" => "\xcf\x9b", + "\xcf\x9c" => "\xcf\x9d", + "\xcf\x9e" => "\xcf\x9f", + "\xcf\xa0" => "\xcf\xa1", + "\xcf\xa2" => "\xcf\xa3", + "\xcf\xa4" => "\xcf\xa5", + "\xcf\xa6" => "\xcf\xa7", + "\xcf\xa8" => "\xcf\xa9", + "\xcf\xaa" => "\xcf\xab", + "\xcf\xac" => "\xcf\xad", + "\xcf\xae" => "\xcf\xaf", + "\xd0\x80" => "\xd1\x90", + "\xd0\x81" => "\xd1\x91", + "\xd0\x82" => "\xd1\x92", + 
"\xd0\x83" => "\xd1\x93", + "\xd0\x84" => "\xd1\x94", + "\xd0\x85" => "\xd1\x95", + "\xd0\x86" => "\xd1\x96", + "\xd0\x87" => "\xd1\x97", + "\xd0\x88" => "\xd1\x98", + "\xd0\x89" => "\xd1\x99", + "\xd0\x8a" => "\xd1\x9a", + "\xd0\x8b" => "\xd1\x9b", + "\xd0\x8c" => "\xd1\x9c", + "\xd0\x8d" => "\xd1\x9d", + "\xd0\x8e" => "\xd1\x9e", + "\xd0\x8f" => "\xd1\x9f", + "\xd0\x90" => "\xd0\xb0", + "\xd0\x91" => "\xd0\xb1", + "\xd0\x92" => "\xd0\xb2", + "\xd0\x93" => "\xd0\xb3", + "\xd0\x94" => "\xd0\xb4", + "\xd0\x95" => "\xd0\xb5", + "\xd0\x96" => "\xd0\xb6", + "\xd0\x97" => "\xd0\xb7", + "\xd0\x98" => "\xd0\xb8", + "\xd0\x99" => "\xd0\xb9", + "\xd0\x9a" => "\xd0\xba", + "\xd0\x9b" => "\xd0\xbb", + "\xd0\x9c" => "\xd0\xbc", + "\xd0\x9d" => "\xd0\xbd", + "\xd0\x9e" => "\xd0\xbe", + "\xd0\x9f" => "\xd0\xbf", + "\xd0\xa0" => "\xd1\x80", + "\xd0\xa1" => "\xd1\x81", + "\xd0\xa2" => "\xd1\x82", + "\xd0\xa3" => "\xd1\x83", + "\xd0\xa4" => "\xd1\x84", + "\xd0\xa5" => "\xd1\x85", + "\xd0\xa6" => "\xd1\x86", + "\xd0\xa7" => "\xd1\x87", + "\xd0\xa8" => "\xd1\x88", + "\xd0\xa9" => "\xd1\x89", + "\xd0\xaa" => "\xd1\x8a", + "\xd0\xab" => "\xd1\x8b", + "\xd0\xac" => "\xd1\x8c", + "\xd0\xad" => "\xd1\x8d", + "\xd0\xae" => "\xd1\x8e", + "\xd0\xaf" => "\xd1\x8f", + "\xd1\xa0" => "\xd1\xa1", + "\xd1\xa2" => "\xd1\xa3", + "\xd1\xa4" => "\xd1\xa5", + "\xd1\xa6" => "\xd1\xa7", + "\xd1\xa8" => "\xd1\xa9", + "\xd1\xaa" => "\xd1\xab", + "\xd1\xac" => "\xd1\xad", + "\xd1\xae" => "\xd1\xaf", + "\xd1\xb0" => "\xd1\xb1", + "\xd1\xb2" => "\xd1\xb3", + "\xd1\xb4" => "\xd1\xb5", + "\xd1\xb6" => "\xd1\xb7", + "\xd1\xb8" => "\xd1\xb9", + "\xd1\xba" => "\xd1\xbb", + "\xd1\xbc" => "\xd1\xbd", + "\xd1\xbe" => "\xd1\xbf", + "\xd2\x80" => "\xd2\x81", + "\xd2\x8a" => "\xd2\x8b", + "\xd2\x8c" => "\xd2\x8d", + "\xd2\x8e" => "\xd2\x8f", + "\xd2\x90" => "\xd2\x91", + "\xd2\x92" => "\xd2\x93", + "\xd2\x94" => "\xd2\x95", + "\xd2\x96" => "\xd2\x97", + "\xd2\x98" => "\xd2\x99", + "\xd2\x9a" => "\xd2\x9b", + "\xd2\x9c" 
=> "\xd2\x9d", + "\xd2\x9e" => "\xd2\x9f", + "\xd2\xa0" => "\xd2\xa1", + "\xd2\xa2" => "\xd2\xa3", + "\xd2\xa4" => "\xd2\xa5", + "\xd2\xa6" => "\xd2\xa7", + "\xd2\xa8" => "\xd2\xa9", + "\xd2\xaa" => "\xd2\xab", + "\xd2\xac" => "\xd2\xad", + "\xd2\xae" => "\xd2\xaf", + "\xd2\xb0" => "\xd2\xb1", + "\xd2\xb2" => "\xd2\xb3", + "\xd2\xb4" => "\xd2\xb5", + "\xd2\xb6" => "\xd2\xb7", + "\xd2\xb8" => "\xd2\xb9", + "\xd2\xba" => "\xd2\xbb", + "\xd2\xbc" => "\xd2\xbd", + "\xd2\xbe" => "\xd2\xbf", + "\xd3\x81" => "\xd3\x82", + "\xd3\x83" => "\xd3\x84", + "\xd3\x85" => "\xd3\x86", + "\xd3\x87" => "\xd3\x88", + "\xd3\x89" => "\xd3\x8a", + "\xd3\x8b" => "\xd3\x8c", + "\xd3\x8d" => "\xd3\x8e", + "\xd3\x90" => "\xd3\x91", + "\xd3\x92" => "\xd3\x93", + "\xd3\x94" => "\xd3\x95", + "\xd3\x96" => "\xd3\x97", + "\xd3\x98" => "\xd3\x99", + "\xd3\x9a" => "\xd3\x9b", + "\xd3\x9c" => "\xd3\x9d", + "\xd3\x9e" => "\xd3\x9f", + "\xd3\xa0" => "\xd3\xa1", + "\xd3\xa2" => "\xd3\xa3", + "\xd3\xa4" => "\xd3\xa5", + "\xd3\xa6" => "\xd3\xa7", + "\xd3\xa8" => "\xd3\xa9", + "\xd3\xaa" => "\xd3\xab", + "\xd3\xac" => "\xd3\xad", + "\xd3\xae" => "\xd3\xaf", + "\xd3\xb0" => "\xd3\xb1", + "\xd3\xb2" => "\xd3\xb3", + "\xd3\xb4" => "\xd3\xb5", + "\xd3\xb8" => "\xd3\xb9", + "\xd4\x80" => "\xd4\x81", + "\xd4\x82" => "\xd4\x83", + "\xd4\x84" => "\xd4\x85", + "\xd4\x86" => "\xd4\x87", + "\xd4\x88" => "\xd4\x89", + "\xd4\x8a" => "\xd4\x8b", + "\xd4\x8c" => "\xd4\x8d", + "\xd4\x8e" => "\xd4\x8f", + "\xd4\xb1" => "\xd5\xa1", + "\xd4\xb2" => "\xd5\xa2", + "\xd4\xb3" => "\xd5\xa3", + "\xd4\xb4" => "\xd5\xa4", + "\xd4\xb5" => "\xd5\xa5", + "\xd4\xb6" => "\xd5\xa6", + "\xd4\xb7" => "\xd5\xa7", + "\xd4\xb8" => "\xd5\xa8", + "\xd4\xb9" => "\xd5\xa9", + "\xd4\xba" => "\xd5\xaa", + "\xd4\xbb" => "\xd5\xab", + "\xd4\xbc" => "\xd5\xac", + "\xd4\xbd" => "\xd5\xad", + "\xd4\xbe" => "\xd5\xae", + "\xd4\xbf" => "\xd5\xaf", + "\xd5\x80" => "\xd5\xb0", + "\xd5\x81" => "\xd5\xb1", + "\xd5\x82" => "\xd5\xb2", + "\xd5\x83" => 
"\xd5\xb3", + "\xd5\x84" => "\xd5\xb4", + "\xd5\x85" => "\xd5\xb5", + "\xd5\x86" => "\xd5\xb6", + "\xd5\x87" => "\xd5\xb7", + "\xd5\x88" => "\xd5\xb8", + "\xd5\x89" => "\xd5\xb9", + "\xd5\x8a" => "\xd5\xba", + "\xd5\x8b" => "\xd5\xbb", + "\xd5\x8c" => "\xd5\xbc", + "\xd5\x8d" => "\xd5\xbd", + "\xd5\x8e" => "\xd5\xbe", + "\xd5\x8f" => "\xd5\xbf", + "\xd5\x90" => "\xd6\x80", + "\xd5\x91" => "\xd6\x81", + "\xd5\x92" => "\xd6\x82", + "\xd5\x93" => "\xd6\x83", + "\xd5\x94" => "\xd6\x84", + "\xd5\x95" => "\xd6\x85", + "\xd5\x96" => "\xd6\x86", + "\xe1\xb8\x80" => "\xe1\xb8\x81", + "\xe1\xb8\x82" => "\xe1\xb8\x83", + "\xe1\xb8\x84" => "\xe1\xb8\x85", + "\xe1\xb8\x86" => "\xe1\xb8\x87", + "\xe1\xb8\x88" => "\xe1\xb8\x89", + "\xe1\xb8\x8a" => "\xe1\xb8\x8b", + "\xe1\xb8\x8c" => "\xe1\xb8\x8d", + "\xe1\xb8\x8e" => "\xe1\xb8\x8f", + "\xe1\xb8\x90" => "\xe1\xb8\x91", + "\xe1\xb8\x92" => "\xe1\xb8\x93", + "\xe1\xb8\x94" => "\xe1\xb8\x95", + "\xe1\xb8\x96" => "\xe1\xb8\x97", + "\xe1\xb8\x98" => "\xe1\xb8\x99", + "\xe1\xb8\x9a" => "\xe1\xb8\x9b", + "\xe1\xb8\x9c" => "\xe1\xb8\x9d", + "\xe1\xb8\x9e" => "\xe1\xb8\x9f", + "\xe1\xb8\xa0" => "\xe1\xb8\xa1", + "\xe1\xb8\xa2" => "\xe1\xb8\xa3", + "\xe1\xb8\xa4" => "\xe1\xb8\xa5", + "\xe1\xb8\xa6" => "\xe1\xb8\xa7", + "\xe1\xb8\xa8" => "\xe1\xb8\xa9", + "\xe1\xb8\xaa" => "\xe1\xb8\xab", + "\xe1\xb8\xac" => "\xe1\xb8\xad", + "\xe1\xb8\xae" => "\xe1\xb8\xaf", + "\xe1\xb8\xb0" => "\xe1\xb8\xb1", + "\xe1\xb8\xb2" => "\xe1\xb8\xb3", + "\xe1\xb8\xb4" => "\xe1\xb8\xb5", + "\xe1\xb8\xb6" => "\xe1\xb8\xb7", + "\xe1\xb8\xb8" => "\xe1\xb8\xb9", + "\xe1\xb8\xba" => "\xe1\xb8\xbb", + "\xe1\xb8\xbc" => "\xe1\xb8\xbd", + "\xe1\xb8\xbe" => "\xe1\xb8\xbf", + "\xe1\xb9\x80" => "\xe1\xb9\x81", + "\xe1\xb9\x82" => "\xe1\xb9\x83", + "\xe1\xb9\x84" => "\xe1\xb9\x85", + "\xe1\xb9\x86" => "\xe1\xb9\x87", + "\xe1\xb9\x88" => "\xe1\xb9\x89", + "\xe1\xb9\x8a" => "\xe1\xb9\x8b", + "\xe1\xb9\x8c" => "\xe1\xb9\x8d", + "\xe1\xb9\x8e" => "\xe1\xb9\x8f", + 
"\xe1\xb9\x90" => "\xe1\xb9\x91", + "\xe1\xb9\x92" => "\xe1\xb9\x93", + "\xe1\xb9\x94" => "\xe1\xb9\x95", + "\xe1\xb9\x96" => "\xe1\xb9\x97", + "\xe1\xb9\x98" => "\xe1\xb9\x99", + "\xe1\xb9\x9a" => "\xe1\xb9\x9b", + "\xe1\xb9\x9c" => "\xe1\xb9\x9d", + "\xe1\xb9\x9e" => "\xe1\xb9\x9f", + "\xe1\xb9\xa0" => "\xe1\xb9\xa1", + "\xe1\xb9\xa2" => "\xe1\xb9\xa3", + "\xe1\xb9\xa4" => "\xe1\xb9\xa5", + "\xe1\xb9\xa6" => "\xe1\xb9\xa7", + "\xe1\xb9\xa8" => "\xe1\xb9\xa9", + "\xe1\xb9\xaa" => "\xe1\xb9\xab", + "\xe1\xb9\xac" => "\xe1\xb9\xad", + "\xe1\xb9\xae" => "\xe1\xb9\xaf", + "\xe1\xb9\xb0" => "\xe1\xb9\xb1", + "\xe1\xb9\xb2" => "\xe1\xb9\xb3", + "\xe1\xb9\xb4" => "\xe1\xb9\xb5", + "\xe1\xb9\xb6" => "\xe1\xb9\xb7", + "\xe1\xb9\xb8" => "\xe1\xb9\xb9", + "\xe1\xb9\xba" => "\xe1\xb9\xbb", + "\xe1\xb9\xbc" => "\xe1\xb9\xbd", + "\xe1\xb9\xbe" => "\xe1\xb9\xbf", + "\xe1\xba\x80" => "\xe1\xba\x81", + "\xe1\xba\x82" => "\xe1\xba\x83", + "\xe1\xba\x84" => "\xe1\xba\x85", + "\xe1\xba\x86" => "\xe1\xba\x87", + "\xe1\xba\x88" => "\xe1\xba\x89", + "\xe1\xba\x8a" => "\xe1\xba\x8b", + "\xe1\xba\x8c" => "\xe1\xba\x8d", + "\xe1\xba\x8e" => "\xe1\xba\x8f", + "\xe1\xba\x90" => "\xe1\xba\x91", + "\xe1\xba\x92" => "\xe1\xba\x93", + "\xe1\xba\x94" => "\xe1\xba\x95", + "\xe1\xba\xa0" => "\xe1\xba\xa1", + "\xe1\xba\xa2" => "\xe1\xba\xa3", + "\xe1\xba\xa4" => "\xe1\xba\xa5", + "\xe1\xba\xa6" => "\xe1\xba\xa7", + "\xe1\xba\xa8" => "\xe1\xba\xa9", + "\xe1\xba\xaa" => "\xe1\xba\xab", + "\xe1\xba\xac" => "\xe1\xba\xad", + "\xe1\xba\xae" => "\xe1\xba\xaf", + "\xe1\xba\xb0" => "\xe1\xba\xb1", + "\xe1\xba\xb2" => "\xe1\xba\xb3", + "\xe1\xba\xb4" => "\xe1\xba\xb5", + "\xe1\xba\xb6" => "\xe1\xba\xb7", + "\xe1\xba\xb8" => "\xe1\xba\xb9", + "\xe1\xba\xba" => "\xe1\xba\xbb", + "\xe1\xba\xbc" => "\xe1\xba\xbd", + "\xe1\xba\xbe" => "\xe1\xba\xbf", + "\xe1\xbb\x80" => "\xe1\xbb\x81", + "\xe1\xbb\x82" => "\xe1\xbb\x83", + "\xe1\xbb\x84" => "\xe1\xbb\x85", + "\xe1\xbb\x86" => "\xe1\xbb\x87", + "\xe1\xbb\x88" => 
"\xe1\xbb\x89", + "\xe1\xbb\x8a" => "\xe1\xbb\x8b", + "\xe1\xbb\x8c" => "\xe1\xbb\x8d", + "\xe1\xbb\x8e" => "\xe1\xbb\x8f", + "\xe1\xbb\x90" => "\xe1\xbb\x91", + "\xe1\xbb\x92" => "\xe1\xbb\x93", + "\xe1\xbb\x94" => "\xe1\xbb\x95", + "\xe1\xbb\x96" => "\xe1\xbb\x97", + "\xe1\xbb\x98" => "\xe1\xbb\x99", + "\xe1\xbb\x9a" => "\xe1\xbb\x9b", + "\xe1\xbb\x9c" => "\xe1\xbb\x9d", + "\xe1\xbb\x9e" => "\xe1\xbb\x9f", + "\xe1\xbb\xa0" => "\xe1\xbb\xa1", + "\xe1\xbb\xa2" => "\xe1\xbb\xa3", + "\xe1\xbb\xa4" => "\xe1\xbb\xa5", + "\xe1\xbb\xa6" => "\xe1\xbb\xa7", + "\xe1\xbb\xa8" => "\xe1\xbb\xa9", + "\xe1\xbb\xaa" => "\xe1\xbb\xab", + "\xe1\xbb\xac" => "\xe1\xbb\xad", + "\xe1\xbb\xae" => "\xe1\xbb\xaf", + "\xe1\xbb\xb0" => "\xe1\xbb\xb1", + "\xe1\xbb\xb2" => "\xe1\xbb\xb3", + "\xe1\xbb\xb4" => "\xe1\xbb\xb5", + "\xe1\xbb\xb6" => "\xe1\xbb\xb7", + "\xe1\xbb\xb8" => "\xe1\xbb\xb9", + "\xe1\xbc\x88" => "\xe1\xbc\x80", + "\xe1\xbc\x89" => "\xe1\xbc\x81", + "\xe1\xbc\x8a" => "\xe1\xbc\x82", + "\xe1\xbc\x8b" => "\xe1\xbc\x83", + "\xe1\xbc\x8c" => "\xe1\xbc\x84", + "\xe1\xbc\x8d" => "\xe1\xbc\x85", + "\xe1\xbc\x8e" => "\xe1\xbc\x86", + "\xe1\xbc\x8f" => "\xe1\xbc\x87", + "\xe1\xbc\x98" => "\xe1\xbc\x90", + "\xe1\xbc\x99" => "\xe1\xbc\x91", + "\xe1\xbc\x9a" => "\xe1\xbc\x92", + "\xe1\xbc\x9b" => "\xe1\xbc\x93", + "\xe1\xbc\x9c" => "\xe1\xbc\x94", + "\xe1\xbc\x9d" => "\xe1\xbc\x95", + "\xe1\xbc\xa9" => "\xe1\xbc\xa1", + "\xe1\xbc\xaa" => "\xe1\xbc\xa2", + "\xe1\xbc\xab" => "\xe1\xbc\xa3", + "\xe1\xbc\xac" => "\xe1\xbc\xa4", + "\xe1\xbc\xad" => "\xe1\xbc\xa5", + "\xe1\xbc\xae" => "\xe1\xbc\xa6", + "\xe1\xbc\xaf" => "\xe1\xbc\xa7", + "\xe1\xbc\xb8" => "\xe1\xbc\xb0", + "\xe1\xbc\xb9" => "\xe1\xbc\xb1", + "\xe1\xbc\xba" => "\xe1\xbc\xb2", + "\xe1\xbc\xbb" => "\xe1\xbc\xb3", + "\xe1\xbc\xbc" => "\xe1\xbc\xb4", + "\xe1\xbc\xbd" => "\xe1\xbc\xb5", + "\xe1\xbc\xbe" => "\xe1\xbc\xb6", + "\xe1\xbc\xbf" => "\xe1\xbc\xb7", + "\xe1\xbd\x88" => "\xe1\xbd\x80", + "\xe1\xbd\x89" => "\xe1\xbd\x81", + 
"\xe1\xbd\x8a" => "\xe1\xbd\x82", + "\xe1\xbd\x8b" => "\xe1\xbd\x83", + "\xe1\xbd\x8c" => "\xe1\xbd\x84", + "\xe1\xbd\x8d" => "\xe1\xbd\x85", + "\xe1\xbd\x99" => "\xe1\xbd\x91", + "\xe1\xbd\x9b" => "\xe1\xbd\x93", + "\xe1\xbd\x9d" => "\xe1\xbd\x95", + "\xe1\xbd\x9f" => "\xe1\xbd\x97", + "\xe1\xbd\xa9" => "\xe1\xbd\xa1", + "\xe1\xbd\xaa" => "\xe1\xbd\xa2", + "\xe1\xbd\xab" => "\xe1\xbd\xa3", + "\xe1\xbd\xac" => "\xe1\xbd\xa4", + "\xe1\xbd\xad" => "\xe1\xbd\xa5", + "\xe1\xbd\xae" => "\xe1\xbd\xa6", + "\xe1\xbd\xaf" => "\xe1\xbd\xa7", + "\xe1\xbe\x88" => "\xe1\xbe\x80", + "\xe1\xbe\x89" => "\xe1\xbe\x81", + "\xe1\xbe\x8a" => "\xe1\xbe\x82", + "\xe1\xbe\x8b" => "\xe1\xbe\x83", + "\xe1\xbe\x8c" => "\xe1\xbe\x84", + "\xe1\xbe\x8d" => "\xe1\xbe\x85", + "\xe1\xbe\x8e" => "\xe1\xbe\x86", + "\xe1\xbe\x8f" => "\xe1\xbe\x87", + "\xe1\xbe\x98" => "\xe1\xbe\x90", + "\xe1\xbe\x99" => "\xe1\xbe\x91", + "\xe1\xbe\x9a" => "\xe1\xbe\x92", + "\xe1\xbe\x9b" => "\xe1\xbe\x93", + "\xe1\xbe\x9c" => "\xe1\xbe\x94", + "\xe1\xbe\x9d" => "\xe1\xbe\x95", + "\xe1\xbe\x9e" => "\xe1\xbe\x96", + "\xe1\xbe\x9f" => "\xe1\xbe\x97", + "\xe1\xbe\xa9" => "\xe1\xbe\xa1", + "\xe1\xbe\xaa" => "\xe1\xbe\xa2", + "\xe1\xbe\xab" => "\xe1\xbe\xa3", + "\xe1\xbe\xac" => "\xe1\xbe\xa4", + "\xe1\xbe\xad" => "\xe1\xbe\xa5", + "\xe1\xbe\xae" => "\xe1\xbe\xa6", + "\xe1\xbe\xaf" => "\xe1\xbe\xa7", + "\xe1\xbe\xb8" => "\xe1\xbe\xb0", + "\xe1\xbe\xb9" => "\xe1\xbe\xb1", + "\xe1\xbe\xba" => "\xe1\xbd\xb0", + "\xe1\xbe\xbb" => "\xe1\xbd\xb1", + "\xe1\xbe\xbc" => "\xe1\xbe\xb3", + "\xe1\xbf\x88" => "\xe1\xbd\xb2", + "\xe1\xbf\x89" => "\xe1\xbd\xb3", + "\xe1\xbf\x8a" => "\xe1\xbd\xb4", + "\xe1\xbf\x8b" => "\xe1\xbd\xb5", + "\xe1\xbf\x8c" => "\xe1\xbf\x83", + "\xe1\xbf\x98" => "\xe1\xbf\x90", + "\xe1\xbf\x99" => "\xe1\xbf\x91", + "\xe1\xbf\x9a" => "\xe1\xbd\xb6", + "\xe1\xbf\x9b" => "\xe1\xbd\xb7", + "\xe1\xbf\xa9" => "\xe1\xbf\xa1", + "\xe1\xbf\xaa" => "\xe1\xbd\xba", + "\xe1\xbf\xab" => "\xe1\xbd\xbb", + "\xe1\xbf\xac" => 
"\xe1\xbf\xa5", + "\xe1\xbf\xb8" => "\xe1\xbd\xb8", + "\xe1\xbf\xb9" => "\xe1\xbd\xb9", + "\xe1\xbf\xba" => "\xe1\xbd\xbc", + "\xe1\xbf\xbb" => "\xe1\xbd\xbd", + "\xe1\xbf\xbc" => "\xe1\xbf\xb3", + "\xef\xbc\xa1" => "\xef\xbd\x81", + "\xef\xbc\xa2" => "\xef\xbd\x82", + "\xef\xbc\xa3" => "\xef\xbd\x83", + "\xef\xbc\xa4" => "\xef\xbd\x84", + "\xef\xbc\xa5" => "\xef\xbd\x85", + "\xef\xbc\xa6" => "\xef\xbd\x86", + "\xef\xbc\xa7" => "\xef\xbd\x87", + "\xef\xbc\xa8" => "\xef\xbd\x88", + "\xef\xbc\xa9" => "\xef\xbd\x89", + "\xef\xbc\xaa" => "\xef\xbd\x8a", + "\xef\xbc\xab" => "\xef\xbd\x8b", + "\xef\xbc\xac" => "\xef\xbd\x8c", + "\xef\xbc\xad" => "\xef\xbd\x8d", + "\xef\xbc\xae" => "\xef\xbd\x8e", + "\xef\xbc\xaf" => "\xef\xbd\x8f", + "\xef\xbc\xb0" => "\xef\xbd\x90", + "\xef\xbc\xb1" => "\xef\xbd\x91", + "\xef\xbc\xb2" => "\xef\xbd\x92", + "\xef\xbc\xb3" => "\xef\xbd\x93", + "\xef\xbc\xb4" => "\xef\xbd\x94", + "\xef\xbc\xb5" => "\xef\xbd\x95", + "\xef\xbc\xb6" => "\xef\xbd\x96", + "\xef\xbc\xb7" => "\xef\xbd\x97", + "\xef\xbc\xb8" => "\xef\xbd\x98", + "\xef\xbc\xb9" => "\xef\xbd\x99", + "\xef\xbc\xba" => "\xef\xbd\x9a", + ); + + #Unicode Character Database 6.0.0 (2010-06-04) + #autogenerated by unicode_blocks_txt2php() PHP function at 2011-06-04 00:19:39, 209 blocks total + public static $unicode_blocks = array( + 'Basic Latin' => array( + 0 => 0x0000, + 1 => 0x007F, + 2 => 0, + ), + 'Latin-1 Supplement' => array( + 0 => 0x0080, + 1 => 0x00FF, + 2 => 1, + ), + 'Latin Extended-A' => array( + 0 => 0x0100, + 1 => 0x017F, + 2 => 2, + ), + 'Latin Extended-B' => array( + 0 => 0x0180, + 1 => 0x024F, + 2 => 3, + ), + 'IPA Extensions' => array( + 0 => 0x0250, + 1 => 0x02AF, + 2 => 4, + ), + 'Spacing Modifier Letters' => array( + 0 => 0x02B0, + 1 => 0x02FF, + 2 => 5, + ), + 'Combining Diacritical Marks' => array( + 0 => 0x0300, + 1 => 0x036F, + 2 => 6, + ), + 'Greek and Coptic' => array( + 0 => 0x0370, + 1 => 0x03FF, + 2 => 7, + ), + 'Cyrillic' => array( + 0 => 0x0400, + 1 => 
0x04FF, + 2 => 8, + ), + 'Cyrillic Supplement' => array( + 0 => 0x0500, + 1 => 0x052F, + 2 => 9, + ), + 'Armenian' => array( + 0 => 0x0530, + 1 => 0x058F, + 2 => 10, + ), + 'Hebrew' => array( + 0 => 0x0590, + 1 => 0x05FF, + 2 => 11, + ), + 'Arabic' => array( + 0 => 0x0600, + 1 => 0x06FF, + 2 => 12, + ), + 'Syriac' => array( + 0 => 0x0700, + 1 => 0x074F, + 2 => 13, + ), + 'Arabic Supplement' => array( + 0 => 0x0750, + 1 => 0x077F, + 2 => 14, + ), + 'Thaana' => array( + 0 => 0x0780, + 1 => 0x07BF, + 2 => 15, + ), + 'NKo' => array( + 0 => 0x07C0, + 1 => 0x07FF, + 2 => 16, + ), + 'Samaritan' => array( + 0 => 0x0800, + 1 => 0x083F, + 2 => 17, + ), + 'Mandaic' => array( + 0 => 0x0840, + 1 => 0x085F, + 2 => 18, + ), + 'Devanagari' => array( + 0 => 0x0900, + 1 => 0x097F, + 2 => 19, + ), + 'Bengali' => array( + 0 => 0x0980, + 1 => 0x09FF, + 2 => 20, + ), + 'Gurmukhi' => array( + 0 => 0x0A00, + 1 => 0x0A7F, + 2 => 21, + ), + 'Gujarati' => array( + 0 => 0x0A80, + 1 => 0x0AFF, + 2 => 22, + ), + 'Oriya' => array( + 0 => 0x0B00, + 1 => 0x0B7F, + 2 => 23, + ), + 'Tamil' => array( + 0 => 0x0B80, + 1 => 0x0BFF, + 2 => 24, + ), + 'Telugu' => array( + 0 => 0x0C00, + 1 => 0x0C7F, + 2 => 25, + ), + 'Kannada' => array( + 0 => 0x0C80, + 1 => 0x0CFF, + 2 => 26, + ), + 'Malayalam' => array( + 0 => 0x0D00, + 1 => 0x0D7F, + 2 => 27, + ), + 'Sinhala' => array( + 0 => 0x0D80, + 1 => 0x0DFF, + 2 => 28, + ), + 'Thai' => array( + 0 => 0x0E00, + 1 => 0x0E7F, + 2 => 29, + ), + 'Lao' => array( + 0 => 0x0E80, + 1 => 0x0EFF, + 2 => 30, + ), + 'Tibetan' => array( + 0 => 0x0F00, + 1 => 0x0FFF, + 2 => 31, + ), + 'Myanmar' => array( + 0 => 0x1000, + 1 => 0x109F, + 2 => 32, + ), + 'Georgian' => array( + 0 => 0x10A0, + 1 => 0x10FF, + 2 => 33, + ), + 'Hangul Jamo' => array( + 0 => 0x1100, + 1 => 0x11FF, + 2 => 34, + ), + 'Ethiopic' => array( + 0 => 0x1200, + 1 => 0x137F, + 2 => 35, + ), + 'Ethiopic Supplement' => array( + 0 => 0x1380, + 1 => 0x139F, + 2 => 36, + ), + 'Cherokee' => array( + 0 => 0x13A0, + 1 
=> 0x13FF, + 2 => 37, + ), + 'Unified Canadian Aboriginal Syllabics' => array( + 0 => 0x1400, + 1 => 0x167F, + 2 => 38, + ), + 'Ogham' => array( + 0 => 0x1680, + 1 => 0x169F, + 2 => 39, + ), + 'Runic' => array( + 0 => 0x16A0, + 1 => 0x16FF, + 2 => 40, + ), + 'Tagalog' => array( + 0 => 0x1700, + 1 => 0x171F, + 2 => 41, + ), + 'Hanunoo' => array( + 0 => 0x1720, + 1 => 0x173F, + 2 => 42, + ), + 'Buhid' => array( + 0 => 0x1740, + 1 => 0x175F, + 2 => 43, + ), + 'Tagbanwa' => array( + 0 => 0x1760, + 1 => 0x177F, + 2 => 44, + ), + 'Khmer' => array( + 0 => 0x1780, + 1 => 0x17FF, + 2 => 45, + ), + 'Mongolian' => array( + 0 => 0x1800, + 1 => 0x18AF, + 2 => 46, + ), + 'Unified Canadian Aboriginal Syllabics Extended' => array( + 0 => 0x18B0, + 1 => 0x18FF, + 2 => 47, + ), + 'Limbu' => array( + 0 => 0x1900, + 1 => 0x194F, + 2 => 48, + ), + 'Tai Le' => array( + 0 => 0x1950, + 1 => 0x197F, + 2 => 49, + ), + 'New Tai Lue' => array( + 0 => 0x1980, + 1 => 0x19DF, + 2 => 50, + ), + 'Khmer Symbols' => array( + 0 => 0x19E0, + 1 => 0x19FF, + 2 => 51, + ), + 'Buginese' => array( + 0 => 0x1A00, + 1 => 0x1A1F, + 2 => 52, + ), + 'Tai Tham' => array( + 0 => 0x1A20, + 1 => 0x1AAF, + 2 => 53, + ), + 'Balinese' => array( + 0 => 0x1B00, + 1 => 0x1B7F, + 2 => 54, + ), + 'Sundanese' => array( + 0 => 0x1B80, + 1 => 0x1BBF, + 2 => 55, + ), + 'Batak' => array( + 0 => 0x1BC0, + 1 => 0x1BFF, + 2 => 56, + ), + 'Lepcha' => array( + 0 => 0x1C00, + 1 => 0x1C4F, + 2 => 57, + ), + 'Ol Chiki' => array( + 0 => 0x1C50, + 1 => 0x1C7F, + 2 => 58, + ), + 'Vedic Extensions' => array( + 0 => 0x1CD0, + 1 => 0x1CFF, + 2 => 59, + ), + 'Phonetic Extensions' => array( + 0 => 0x1D00, + 1 => 0x1D7F, + 2 => 60, + ), + 'Phonetic Extensions Supplement' => array( + 0 => 0x1D80, + 1 => 0x1DBF, + 2 => 61, + ), + 'Combining Diacritical Marks Supplement' => array( + 0 => 0x1DC0, + 1 => 0x1DFF, + 2 => 62, + ), + 'Latin Extended Additional' => array( + 0 => 0x1E00, + 1 => 0x1EFF, + 2 => 63, + ), + 'Greek Extended' => array( + 0 => 
0x1F00, + 1 => 0x1FFF, + 2 => 64, + ), + 'General Punctuation' => array( + 0 => 0x2000, + 1 => 0x206F, + 2 => 65, + ), + 'Superscripts and Subscripts' => array( + 0 => 0x2070, + 1 => 0x209F, + 2 => 66, + ), + 'Currency Symbols' => array( + 0 => 0x20A0, + 1 => 0x20CF, + 2 => 67, + ), + 'Combining Diacritical Marks for Symbols' => array( + 0 => 0x20D0, + 1 => 0x20FF, + 2 => 68, + ), + 'Letterlike Symbols' => array( + 0 => 0x2100, + 1 => 0x214F, + 2 => 69, + ), + 'Number Forms' => array( + 0 => 0x2150, + 1 => 0x218F, + 2 => 70, + ), + 'Arrows' => array( + 0 => 0x2190, + 1 => 0x21FF, + 2 => 71, + ), + 'Mathematical Operators' => array( + 0 => 0x2200, + 1 => 0x22FF, + 2 => 72, + ), + 'Miscellaneous Technical' => array( + 0 => 0x2300, + 1 => 0x23FF, + 2 => 73, + ), + 'Control Pictures' => array( + 0 => 0x2400, + 1 => 0x243F, + 2 => 74, + ), + 'Optical Character Recognition' => array( + 0 => 0x2440, + 1 => 0x245F, + 2 => 75, + ), + 'Enclosed Alphanumerics' => array( + 0 => 0x2460, + 1 => 0x24FF, + 2 => 76, + ), + 'Box Drawing' => array( + 0 => 0x2500, + 1 => 0x257F, + 2 => 77, + ), + 'Block Elements' => array( + 0 => 0x2580, + 1 => 0x259F, + 2 => 78, + ), + 'Geometric Shapes' => array( + 0 => 0x25A0, + 1 => 0x25FF, + 2 => 79, + ), + 'Miscellaneous Symbols' => array( + 0 => 0x2600, + 1 => 0x26FF, + 2 => 80, + ), + 'Dingbats' => array( + 0 => 0x2700, + 1 => 0x27BF, + 2 => 81, + ), + 'Miscellaneous Mathematical Symbols-A' => array( + 0 => 0x27C0, + 1 => 0x27EF, + 2 => 82, + ), + 'Supplemental Arrows-A' => array( + 0 => 0x27F0, + 1 => 0x27FF, + 2 => 83, + ), + 'Braille Patterns' => array( + 0 => 0x2800, + 1 => 0x28FF, + 2 => 84, + ), + 'Supplemental Arrows-B' => array( + 0 => 0x2900, + 1 => 0x297F, + 2 => 85, + ), + 'Miscellaneous Mathematical Symbols-B' => array( + 0 => 0x2980, + 1 => 0x29FF, + 2 => 86, + ), + 'Supplemental Mathematical Operators' => array( + 0 => 0x2A00, + 1 => 0x2AFF, + 2 => 87, + ), + 'Miscellaneous Symbols and Arrows' => array( + 0 => 0x2B00, + 1 => 
0x2BFF, + 2 => 88, + ), + 'Glagolitic' => array( + 0 => 0x2C00, + 1 => 0x2C5F, + 2 => 89, + ), + 'Latin Extended-C' => array( + 0 => 0x2C60, + 1 => 0x2C7F, + 2 => 90, + ), + 'Coptic' => array( + 0 => 0x2C80, + 1 => 0x2CFF, + 2 => 91, + ), + 'Georgian Supplement' => array( + 0 => 0x2D00, + 1 => 0x2D2F, + 2 => 92, + ), + 'Tifinagh' => array( + 0 => 0x2D30, + 1 => 0x2D7F, + 2 => 93, + ), + 'Ethiopic Extended' => array( + 0 => 0x2D80, + 1 => 0x2DDF, + 2 => 94, + ), + 'Cyrillic Extended-A' => array( + 0 => 0x2DE0, + 1 => 0x2DFF, + 2 => 95, + ), + 'Supplemental Punctuation' => array( + 0 => 0x2E00, + 1 => 0x2E7F, + 2 => 96, + ), + 'CJK Radicals Supplement' => array( + 0 => 0x2E80, + 1 => 0x2EFF, + 2 => 97, + ), + 'Kangxi Radicals' => array( + 0 => 0x2F00, + 1 => 0x2FDF, + 2 => 98, + ), + 'Ideographic Description Characters' => array( + 0 => 0x2FF0, + 1 => 0x2FFF, + 2 => 99, + ), + 'CJK Symbols and Punctuation' => array( + 0 => 0x3000, + 1 => 0x303F, + 2 => 100, + ), + 'Hiragana' => array( + 0 => 0x3040, + 1 => 0x309F, + 2 => 101, + ), + 'Katakana' => array( + 0 => 0x30A0, + 1 => 0x30FF, + 2 => 102, + ), + 'Bopomofo' => array( + 0 => 0x3100, + 1 => 0x312F, + 2 => 103, + ), + 'Hangul Compatibility Jamo' => array( + 0 => 0x3130, + 1 => 0x318F, + 2 => 104, + ), + 'Kanbun' => array( + 0 => 0x3190, + 1 => 0x319F, + 2 => 105, + ), + 'Bopomofo Extended' => array( + 0 => 0x31A0, + 1 => 0x31BF, + 2 => 106, + ), + 'CJK Strokes' => array( + 0 => 0x31C0, + 1 => 0x31EF, + 2 => 107, + ), + 'Katakana Phonetic Extensions' => array( + 0 => 0x31F0, + 1 => 0x31FF, + 2 => 108, + ), + 'Enclosed CJK Letters and Months' => array( + 0 => 0x3200, + 1 => 0x32FF, + 2 => 109, + ), + 'CJK Compatibility' => array( + 0 => 0x3300, + 1 => 0x33FF, + 2 => 110, + ), + 'CJK Unified Ideographs Extension A' => array( + 0 => 0x3400, + 1 => 0x4DBF, + 2 => 111, + ), + 'Yijing Hexagram Symbols' => array( + 0 => 0x4DC0, + 1 => 0x4DFF, + 2 => 112, + ), + 'CJK Unified Ideographs' => array( + 0 => 0x4E00, + 1 => 
0x9FFF, + 2 => 113, + ), + 'Yi Syllables' => array( + 0 => 0xA000, + 1 => 0xA48F, + 2 => 114, + ), + 'Yi Radicals' => array( + 0 => 0xA490, + 1 => 0xA4CF, + 2 => 115, + ), + 'Lisu' => array( + 0 => 0xA4D0, + 1 => 0xA4FF, + 2 => 116, + ), + 'Vai' => array( + 0 => 0xA500, + 1 => 0xA63F, + 2 => 117, + ), + 'Cyrillic Extended-B' => array( + 0 => 0xA640, + 1 => 0xA69F, + 2 => 118, + ), + 'Bamum' => array( + 0 => 0xA6A0, + 1 => 0xA6FF, + 2 => 119, + ), + 'Modifier Tone Letters' => array( + 0 => 0xA700, + 1 => 0xA71F, + 2 => 120, + ), + 'Latin Extended-D' => array( + 0 => 0xA720, + 1 => 0xA7FF, + 2 => 121, + ), + 'Syloti Nagri' => array( + 0 => 0xA800, + 1 => 0xA82F, + 2 => 122, + ), + 'Common Indic Number Forms' => array( + 0 => 0xA830, + 1 => 0xA83F, + 2 => 123, + ), + 'Phags-pa' => array( + 0 => 0xA840, + 1 => 0xA87F, + 2 => 124, + ), + 'Saurashtra' => array( + 0 => 0xA880, + 1 => 0xA8DF, + 2 => 125, + ), + 'Devanagari Extended' => array( + 0 => 0xA8E0, + 1 => 0xA8FF, + 2 => 126, + ), + 'Kayah Li' => array( + 0 => 0xA900, + 1 => 0xA92F, + 2 => 127, + ), + 'Rejang' => array( + 0 => 0xA930, + 1 => 0xA95F, + 2 => 128, + ), + 'Hangul Jamo Extended-A' => array( + 0 => 0xA960, + 1 => 0xA97F, + 2 => 129, + ), + 'Javanese' => array( + 0 => 0xA980, + 1 => 0xA9DF, + 2 => 130, + ), + 'Cham' => array( + 0 => 0xAA00, + 1 => 0xAA5F, + 2 => 131, + ), + 'Myanmar Extended-A' => array( + 0 => 0xAA60, + 1 => 0xAA7F, + 2 => 132, + ), + 'Tai Viet' => array( + 0 => 0xAA80, + 1 => 0xAADF, + 2 => 133, + ), + 'Ethiopic Extended-A' => array( + 0 => 0xAB00, + 1 => 0xAB2F, + 2 => 134, + ), + 'Meetei Mayek' => array( + 0 => 0xABC0, + 1 => 0xABFF, + 2 => 135, + ), + 'Hangul Syllables' => array( + 0 => 0xAC00, + 1 => 0xD7AF, + 2 => 136, + ), + 'Hangul Jamo Extended-B' => array( + 0 => 0xD7B0, + 1 => 0xD7FF, + 2 => 137, + ), + 'High Surrogates' => array( + 0 => 0xD800, + 1 => 0xDB7F, + 2 => 138, + ), + 'High Private Use Surrogates' => array( + 0 => 0xDB80, + 1 => 0xDBFF, + 2 => 139, + ), + 'Low 
Surrogates' => array( + 0 => 0xDC00, + 1 => 0xDFFF, + 2 => 140, + ), + 'Private Use Area' => array( + 0 => 0xE000, + 1 => 0xF8FF, + 2 => 141, + ), + 'CJK Compatibility Ideographs' => array( + 0 => 0xF900, + 1 => 0xFAFF, + 2 => 142, + ), + 'Alphabetic Presentation Forms' => array( + 0 => 0xFB00, + 1 => 0xFB4F, + 2 => 143, + ), + 'Arabic Presentation Forms-A' => array( + 0 => 0xFB50, + 1 => 0xFDFF, + 2 => 144, + ), + 'Variation Selectors' => array( + 0 => 0xFE00, + 1 => 0xFE0F, + 2 => 145, + ), + 'Vertical Forms' => array( + 0 => 0xFE10, + 1 => 0xFE1F, + 2 => 146, + ), + 'Combining Half Marks' => array( + 0 => 0xFE20, + 1 => 0xFE2F, + 2 => 147, + ), + 'CJK Compatibility Forms' => array( + 0 => 0xFE30, + 1 => 0xFE4F, + 2 => 148, + ), + 'Small Form Variants' => array( + 0 => 0xFE50, + 1 => 0xFE6F, + 2 => 149, + ), + 'Arabic Presentation Forms-B' => array( + 0 => 0xFE70, + 1 => 0xFEFF, + 2 => 150, + ), + 'Halfwidth and Fullwidth Forms' => array( + 0 => 0xFF00, + 1 => 0xFFEF, + 2 => 151, + ), + 'Specials' => array( + 0 => 0xFFF0, + 1 => 0xFFFF, + 2 => 152, + ), + 'Linear B Syllabary' => array( + 0 => 0x10000, + 1 => 0x1007F, + 2 => 153, + ), + 'Linear B Ideograms' => array( + 0 => 0x10080, + 1 => 0x100FF, + 2 => 154, + ), + 'Aegean Numbers' => array( + 0 => 0x10100, + 1 => 0x1013F, + 2 => 155, + ), + 'Ancient Greek Numbers' => array( + 0 => 0x10140, + 1 => 0x1018F, + 2 => 156, + ), + 'Ancient Symbols' => array( + 0 => 0x10190, + 1 => 0x101CF, + 2 => 157, + ), + 'Phaistos Disc' => array( + 0 => 0x101D0, + 1 => 0x101FF, + 2 => 158, + ), + 'Lycian' => array( + 0 => 0x10280, + 1 => 0x1029F, + 2 => 159, + ), + 'Carian' => array( + 0 => 0x102A0, + 1 => 0x102DF, + 2 => 160, + ), + 'Old Italic' => array( + 0 => 0x10300, + 1 => 0x1032F, + 2 => 161, + ), + 'Gothic' => array( + 0 => 0x10330, + 1 => 0x1034F, + 2 => 162, + ), + 'Ugaritic' => array( + 0 => 0x10380, + 1 => 0x1039F, + 2 => 163, + ), + 'Old Persian' => array( + 0 => 0x103A0, + 1 => 0x103DF, + 2 => 164, + ), + 'Deseret' 
=> array( + 0 => 0x10400, + 1 => 0x1044F, + 2 => 165, + ), + 'Shavian' => array( + 0 => 0x10450, + 1 => 0x1047F, + 2 => 166, + ), + 'Osmanya' => array( + 0 => 0x10480, + 1 => 0x104AF, + 2 => 167, + ), + 'Cypriot Syllabary' => array( + 0 => 0x10800, + 1 => 0x1083F, + 2 => 168, + ), + 'Imperial Aramaic' => array( + 0 => 0x10840, + 1 => 0x1085F, + 2 => 169, + ), + 'Phoenician' => array( + 0 => 0x10900, + 1 => 0x1091F, + 2 => 170, + ), + 'Lydian' => array( + 0 => 0x10920, + 1 => 0x1093F, + 2 => 171, + ), + 'Kharoshthi' => array( + 0 => 0x10A00, + 1 => 0x10A5F, + 2 => 172, + ), + 'Old South Arabian' => array( + 0 => 0x10A60, + 1 => 0x10A7F, + 2 => 173, + ), + 'Avestan' => array( + 0 => 0x10B00, + 1 => 0x10B3F, + 2 => 174, + ), + 'Inscriptional Parthian' => array( + 0 => 0x10B40, + 1 => 0x10B5F, + 2 => 175, + ), + 'Inscriptional Pahlavi' => array( + 0 => 0x10B60, + 1 => 0x10B7F, + 2 => 176, + ), + 'Old Turkic' => array( + 0 => 0x10C00, + 1 => 0x10C4F, + 2 => 177, + ), + 'Rumi Numeral Symbols' => array( + 0 => 0x10E60, + 1 => 0x10E7F, + 2 => 178, + ), + 'Brahmi' => array( + 0 => 0x11000, + 1 => 0x1107F, + 2 => 179, + ), + 'Kaithi' => array( + 0 => 0x11080, + 1 => 0x110CF, + 2 => 180, + ), + 'Cuneiform' => array( + 0 => 0x12000, + 1 => 0x123FF, + 2 => 181, + ), + 'Cuneiform Numbers and Punctuation' => array( + 0 => 0x12400, + 1 => 0x1247F, + 2 => 182, + ), + 'Egyptian Hieroglyphs' => array( + 0 => 0x13000, + 1 => 0x1342F, + 2 => 183, + ), + 'Bamum Supplement' => array( + 0 => 0x16800, + 1 => 0x16A3F, + 2 => 184, + ), + 'Kana Supplement' => array( + 0 => 0x1B000, + 1 => 0x1B0FF, + 2 => 185, + ), + 'Byzantine Musical Symbols' => array( + 0 => 0x1D000, + 1 => 0x1D0FF, + 2 => 186, + ), + 'Musical Symbols' => array( + 0 => 0x1D100, + 1 => 0x1D1FF, + 2 => 187, + ), + 'Ancient Greek Musical Notation' => array( + 0 => 0x1D200, + 1 => 0x1D24F, + 2 => 188, + ), + 'Tai Xuan Jing Symbols' => array( + 0 => 0x1D300, + 1 => 0x1D35F, + 2 => 189, + ), + 'Counting Rod Numerals' => array( + 
0 => 0x1D360, + 1 => 0x1D37F, + 2 => 190, + ), + 'Mathematical Alphanumeric Symbols' => array( + 0 => 0x1D400, + 1 => 0x1D7FF, + 2 => 191, + ), + 'Mahjong Tiles' => array( + 0 => 0x1F000, + 1 => 0x1F02F, + 2 => 192, + ), + 'Domino Tiles' => array( + 0 => 0x1F030, + 1 => 0x1F09F, + 2 => 193, + ), + 'Playing Cards' => array( + 0 => 0x1F0A0, + 1 => 0x1F0FF, + 2 => 194, + ), + 'Enclosed Alphanumeric Supplement' => array( + 0 => 0x1F100, + 1 => 0x1F1FF, + 2 => 195, + ), + 'Enclosed Ideographic Supplement' => array( + 0 => 0x1F200, + 1 => 0x1F2FF, + 2 => 196, + ), + 'Miscellaneous Symbols And Pictographs' => array( + 0 => 0x1F300, + 1 => 0x1F5FF, + 2 => 197, + ), + 'Emoticons' => array( + 0 => 0x1F600, + 1 => 0x1F64F, + 2 => 198, + ), + 'Transport And Map Symbols' => array( + 0 => 0x1F680, + 1 => 0x1F6FF, + 2 => 199, + ), + 'Alchemical Symbols' => array( + 0 => 0x1F700, + 1 => 0x1F77F, + 2 => 200, + ), + 'CJK Unified Ideographs Extension B' => array( + 0 => 0x20000, + 1 => 0x2A6DF, + 2 => 201, + ), + 'CJK Unified Ideographs Extension C' => array( + 0 => 0x2A700, + 1 => 0x2B73F, + 2 => 202, + ), + 'CJK Unified Ideographs Extension D' => array( + 0 => 0x2B740, + 1 => 0x2B81F, + 2 => 203, + ), + 'CJK Compatibility Ideographs Supplement' => array( + 0 => 0x2F800, + 1 => 0x2FA1F, + 2 => 204, + ), + 'Tags' => array( + 0 => 0xE0000, + 1 => 0xE007F, + 2 => 205, + ), + 'Variation Selectors Supplement' => array( + 0 => 0xE0100, + 1 => 0xE01EF, + 2 => 206, + ), + 'Supplementary Private Use Area-A' => array( + 0 => 0xF0000, + 1 => 0xFFFFF, + 2 => 207, + ), + 'Supplementary Private Use Area-B' => array( + 0 => 0x100000, + 1 => 0x10FFFF, + 2 => 208, + ), + ); + + #calling the methods of this class only statically! 
/**
 * Private constructor: the methods of this class are meant to be called statically,
 * so instances cannot be created from outside.
 */
private function __construct() {}

/**
 * Removes combining diacritical marks from a string, optionally recording
 * what was removed so that self::diactrical_restore() can put it back.
 *
 * When $is_can_restored is TRUE, $restore_table is filled with:
 *   'offsets' => array(character position => removed marks),
 *   'length'  => character length of the cleaned string.
 * Positions are counted in characters (via self::strlen()), not in bytes.
 *
 * NOTE(review): ReflectionTypeHint is defined outside this chunk; it presumably
 * validates the arguments against the @param tags below, so keep the tags in
 * sync with the signature -- TODO confirm.
 *
 * @param string|null $s
 * @param array|null $additional_chars for example: "\xc2\xad" #soft hyphen = discretionary hyphen
 * @param bool $is_can_restored
 * @param array|null &$restore_table
 * @return string|bool|null Returns FALSE if error occurred
 */
public static function diactrical_remove($s, $additional_chars = null, $is_can_restored = false, &$restore_table = null)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (is_null($s)) return $s;

    #alternatives of the pattern: the built-in marks plus the caller's extra characters
    $alternatives = array(self::$diactrical_re);
    if ($additional_chars)
    {
        #iterate by value: no dangling reference is left behind after the loop
        foreach ($additional_chars as $char) $alternatives[] = preg_quote($char, '/');
    }
    $re = '/((?>' . implode('|', $alternatives) . ')+)/sxSX';

    if (! $is_can_restored) return preg_replace($re, '', $s);

    $restore_table = array();
    #with PREG_SPLIT_DELIM_CAPTURE the result alternates: text, marks, text, ..., text
    $parts = preg_split($re, $s, -1, PREG_SPLIT_DELIM_CAPTURE);
    $total = count($parts);
    if ($total === 1) return $s; #the pattern did not match, nothing to remember

    $tail  = end($parts);
    $pos   = 0;
    $clean = '';
    for ($i = 0; $i < $total - 1; $i += 2)
    {
        $clean .= $parts[$i];
        #remember character (not byte!) positions
        $pos += self::strlen($parts[$i]);
        $restore_table['offsets'][$pos] = $parts[$i + 1];
    }
    $restore_table['length'] = $pos + self::strlen($tail);
    return $clean . $tail;
}

/**
 * Restores combining diacritical marks removed by self::diactrical_remove().
 * Works only if the character positions and the number of characters
 * in the string have not changed since the marks were removed.
 *
 * @see self::diactrical_remove()
 * @param string|null $s
 * @param array $restore_table
 * @return string|bool|null Returns FALSE if error occurred (broken $restore_table)
 */
public static function diactrical_restore($s, array $restore_table)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (is_null($s)) return $s;
    if (! $restore_table) return $s; #an empty table means nothing was removed

    #the table must be well-formed and must describe a string of exactly this length
    if (! isset($restore_table['length'], $restore_table['offsets'])
        || ! is_int($restore_table['length'])
        || ! is_array($restore_table['offsets'])
        || $restore_table['length'] !== self::strlen($s)) return false;

    $offset   = 0;
    $restored = '';
    foreach ($restore_table['offsets'] as $pos => $diactricals)
    {
        #copy the characters up to the recorded position, then re-insert the marks
        $restored .= self::substr($s, $offset, $pos - $offset) . $diactricals;
        $offset = $pos;
    }
    #the byte length is an upper bound of the character count, i.e. "take the rest"
    return $restored . self::substr($s, $offset, strlen($s));
}

/**
 * Encodes data from another character encoding to UTF-8.
 *
 * @param array|scalar|null $data
 * @param string $charset
 * @return array|scalar|null Returns FALSE if error occurred
 */
public static function convert_from($data, $charset = 'cp1251')
{
    if (! ReflectionTypeHint::isValid()) return false;
    return self::_convert($data, $charset, 'UTF-8');
}

/**
 * Encodes data from UTF-8 to another character encoding.
 *
 * @param array|scalar|null $data
 * @param string $charset
 * @return array|scalar|null Returns FALSE if error occurred
 */
public static function convert_to($data, $charset = 'cp1251')
{
    if (! ReflectionTypeHint::isValid()) return false;
    return self::_convert($data, 'UTF-8', $charset);
}

/**
 * Recoding the data of any structure to/from UTF-8.
 * Arrays are traversed recursively, both keys and values are recoded.
 * Values that are neither arrays nor strings are returned unchanged.
 *
 * Strategy for strings: iconv, then mbstring, then the built-in fallbacks
 * (which assume that the other side of the conversion is UTF-8), then recode.
 *
 * @see mb_encoding_aliases()
 * @param array|scalar|null $data
 * @param string $charset_from
 * @param string $charset_to
 * @return array|scalar|null Returns FALSE if error occurred
 */
private static function _convert($data, $charset_from, $charset_to)
{
    #convert_cyr_string() source codes for the charsets handled by the fallback
    static $cyr_codes = array(
        'koi8-r'       => 'k',
        'KOI8-R'       => 'k',
        'iso8859-5'    => 'i',
        'cp866'        => 'a',
        'mac-cyrillic' => 'm',
    );

    if (! ReflectionTypeHint::isValid()) return false; #for recursive calls
    if ($charset_from === $charset_to) return $data;

    if (is_array($data))
    {
        $converted = array();
        foreach ($data as $key => $value)
        {
            $key = self::_convert($key, $charset_from, $charset_to);
            if ($key === false) return false;
            $converted[$key] = self::_convert($value, $charset_from, $charset_to);
            #FALSE signals an error unless the source value itself was a boolean
            if ($converted[$key] === false && ! is_bool($value)) return false;
        }
        return $converted;
    }
    if (! is_string($data)) return $data;

    #smart behaviour for errors protected + speed improve
    if ($charset_from === 'UTF-8' && ! self::is_utf8($data)) return $data;
    if ($charset_to === 'UTF-8' && self::is_utf8($data)) return $data;

    #since PHP-5.3.x iconv() faster then mb_convert_encoding()
    #NOTE(review): the suffix order '//IGNORE//TRANSLIT' is unusual, the commonly documented
    #form is '//TRANSLIT//IGNORE' -- verify against the iconv implementation in use
    if (function_exists('iconv')) return iconv($charset_from, $charset_to . '//IGNORE//TRANSLIT', $data);
    if (function_exists('mb_convert_encoding')) return mb_convert_encoding($data, $charset_to, $charset_from);

    #charset_from
    if ($charset_from === 'UTF-16' || $charset_from === 'UCS-2') return self::_convert_from_utf16($data);
    if ($charset_from === 'cp1251' || $charset_from === 'cp1259') return strtr($data, self::$cp1259_table);
    if (isset($cyr_codes[$charset_from]))
    {
        #first to windows-1251, then to UTF-8 through the table
        return strtr(convert_cyr_string($data, $cyr_codes[$charset_from], 'w'), self::$cp1259_table);
    }

    #charset_to
    if ($charset_to === 'cp1251' || $charset_to === 'cp1259') return strtr($data, array_flip(self::$cp1259_table));

    #last trying
    if (function_exists('recode_string'))
    {
        $recoded = @recode_string($charset_from . '..' . $charset_to, $data);
        if (is_string($recoded)) return $recoded;
    }

    trigger_error('Convert "' . $charset_from . '" --> "' . $charset_to . '" is not supported native, "iconv" or "mbstring" extension required', E_USER_WARNING);
    return false;
}

/**
 * Convert UTF-16 / UCS-2 encoding string to UTF-8.
 * Surrogates UTF-16 are supported!
 *
 * In the pure-PHP fallback every malformed surrogate (a high surrogate that is
 * not followed by a low one, a low surrogate without a preceding high one,
 * a high surrogate at the very end) is replaced by U+FFFD, and the code unit
 * that follows a broken high surrogate is still decoded instead of being lost.
 *
 * @param string $s
 * @param string $type 'BE' -- big endian byte order
 *                     'LE' -- little endian byte order
 * @param bool $to_array returns array chars instead whole string?
 * @return string|array|bool UTF-8 string, array chars or FALSE if error occurred
 */
private static function _convert_from_utf16($s, $type = 'BE', $to_array = false)
{
    static $types = array(
        'BE' => 'n', #unsigned short (always 16 bit, big endian byte order)
        'LE' => 'v', #unsigned short (always 16 bit, little endian byte order)
    );
    if (! array_key_exists($type, $types))
    {
        trigger_error('Unexpected value in 2-nd parameter, "' . $type . '" given!', E_USER_WARNING);
        return false;
    }

    #the fastest way:
    if (function_exists('iconv') || function_exists('mb_convert_encoding'))
    {
        if (function_exists('iconv')) $s = iconv('UTF-16' . $type, 'UTF-8', $s);
        elseif (function_exists('mb_convert_encoding')) $s = mb_convert_encoding($s, 'UTF-8', 'UTF-16' . $type);
        return $to_array ? self::str_split($s) : $s;
    }

    /*
    http://en.wikipedia.org/wiki/UTF-16

    UTF-16 represents non-BMP characters (U+10000 through U+10FFFF) using a pair
    of 16-bit words, known as a surrogate pair. First 0x10000 is subtracted from
    the code point to give a 20-bit value, which is split into two 10-bit halves:
    0xD800-0xDBFF holds the most significant half (first, "high" surrogate),
    0xDC00-0xDFFF holds the least significant half (second, "low" surrogate).
    Code points U+D800-U+DFFF are never assigned to characters, so a single
    surrogate never represents a character on its own.

    http://www.russellcottrell.com/greek/utilities/SurrogatePairCalculator.htm
    http://www.russellcottrell.com/greek/utilities/UnicodeRanges.htm

    Conversion of a Unicode scalar value S to a surrogate pair:
        H = Math.floor((S - 0x10000) / 0x400) + 0xD800;
        L = ((S - 0x10000) % 0x400) + 0xDC00;
    The conversion of a surrogate pair to a scalar value:
        N = ((H - 0xD800) * 0x400) + (L - 0xDC00) + 0x10000;
    */
    $broken = "\xEF\xBF\xBD"; #U+FFFD REPLACEMENT CHARACTER (for broken char)
    $chars  = array();
    $hi     = false;         #pending high surrogate, FALSE if there is none
    foreach (unpack($types[$type] . '*', $s) as $unit)
    {
        if ($hi !== false)
        {
            $pending = $hi;
            $hi = false;
            if ($unit >= 0xDC00 && $unit <= 0xDFFF)
            {
                #valid pair
                $chars[] = self::chr((($pending - 0xD800) * 0x400) + ($unit - 0xDC00) + 0x10000);
                continue;
            }
            #the high surrogate is broken, but the current unit must not be lost:
            #fall through and decode it on its own
            $chars[] = $broken;
        }
        if ($unit >= 0xD800 && $unit <= 0xDBFF) $hi = $unit;              #surrogate was found
        elseif ($unit >= 0xDC00 && $unit <= 0xDFFF) $chars[] = $broken;   #low surrogate without a high one
        else $chars[] = self::chr($unit);                                 #not surrogate
    }
    if ($hi !== false) $chars[] = $broken; #the input ended in the middle of a pair
    return $to_array ? $chars : implode('', $chars);
}

/**
 * Strips out device control codes in the ASCII range
 * (0x00-0x08, 0x0B, 0x0C, 0x0E-0x1F; tab, line feed and carriage return are kept).
 *
 * @param string|null String to clean
 * @return string|bool|null Returns FALSE if error occurred
 */
public static function strict($s)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (is_null($s)) return $s;
    return preg_replace('/[\x00-\x08\x0B\x0C\x0E-\x1F]+/sSX', '', $s);
}

/**
 * Check the data accessory to the class of characters ASCII.
 * For null, integer, float, boolean returns TRUE.
/**
 * Check whether the data consists of ASCII characters only.
 * For null, integer, float, boolean returns TRUE.
 *
 * Arrays are traversed recursively (keys and values); if at least one
 * of them is not ASCII, FALSE is returned.
 *
 * @param   array|scalar|null  $data
 * @return  bool
 */
public static function is_ascii($data)
{
	if (! ReflectionTypeHint::isValid()) return false;

	if (is_string($data))
	{
		#stripping every byte of the 0x00..0x7F range must leave nothing
		#(ltrim() is a little faster than preg_match('/^[\x00-\x7f]*$/sSX'))
		return ltrim($data, "\x00..\x7f") === '';
	}

	if (is_array($data))
	{
		foreach ($data as $key => $value)
		{
			if (! self::is_ascii($key)) return false;
			if (! self::is_ascii($value)) return false;
		}
		return true;
	}

	#null, integer, float, boolean are accepted; object or resource are not
	return is_scalar($data) || is_null($data);
}
/**
 * Tries to detect if a string is in Unicode encoding
 *
 * Walks the string byte by byte and only verifies the lead byte / continuation
 * byte structure (1 to 6 byte sequences).
 *
 * @deprecated  Slowly, use self::is_utf8() instead
 * @see     self::is_utf8()
 * @param   string  $s          text
 * @param   bool    $is_strict  strict check of the ASCII range?
 * @return  bool
 */
public static function check($s, $is_strict = true)
{
	if (! ReflectionTypeHint::isValid()) return false;

	$len = strlen($s);
	$i = 0;
	while ($i < $len)
	{
		$byte = ord($s[$i]);
		$i++;

		if ($byte < 0x80) #1 byte 0bbbbbbb
		{
			$is_allowed = $is_strict === false
				|| ($byte > 0x1F && $byte < 0x7F)
				|| $byte == 0x09 || $byte == 0x0A || $byte == 0x0D;
			if (! $is_allowed) return false;
			continue;
		}

		#number of continuation bytes expected after the lead byte
		if     (($byte & 0xE0) == 0xC0) $tail = 1; #2 bytes 110bbbbb 10bbbbbb
		elseif (($byte & 0xF0) == 0xE0) $tail = 2; #3 bytes 1110bbbb 10bbbbbb 10bbbbbb
		elseif (($byte & 0xF8) == 0xF0) $tail = 3; #4 bytes 11110bbb 10bbbbbb 10bbbbbb 10bbbbbb
		elseif (($byte & 0xFC) == 0xF8) $tail = 4; #5 bytes 111110bb 10bbbbbb 10bbbbbb 10bbbbbb 10bbbbbb
		elseif (($byte & 0xFE) == 0xFC) $tail = 5; #6 bytes 1111110b 10bbbbbb 10bbbbbb 10bbbbbb 10bbbbbb 10bbbbbb
		else return false; #does not match any model

		#every following byte must look like 10bbbbbb
		for (; $tail > 0; $tail--, $i++)
		{
			if ($i == $len || (ord($s[$i]) & 0xC0) != 0x80) return false;
		}
	}
	return true;
}
/**
 * Check the data in UTF-8 charset on given ranges of the standard UNICODE.
 * The suitable alternative to regular expressions.
 *
 * For null, integer, float, boolean returns TRUE.
 *
 * Arrays traversed recursively (keys and values).
 * At least if one array element value is not passed checking, it returns FALSE.
 *
 * @example
 *   #A simple check the standard named ranges:
 *   UTF8::blocks_check('поисковые системы Google и Yandex', array('Basic Latin', 'Cyrillic'));
 *   #You can check the named, direct ranges or codepoints together:
 *   UTF8::blocks_check('поисковые системы Google и Yandex', array(array(0x20, 0x7E),     #[\x20-\x7E]
 *                                                                 array(0x0410, 0x044F), #[A-Яa-я]
 *                                                                 0x0401, #russian yo (Ё)
 *                                                                 0x0451, #russian ye (ё)
 *                                                                 'Arrows',
 *                                                                ));
 *
 * @link    http://www.unicode.org/charts/
 * @param   array|scalar|null  $data
 * @param   array|string       $blocks
 * @return  bool               Returns TRUE if every character of the text belongs to the given ranges,
 *                             FALSE otherwise, for broken UTF-8 or for an unknown/unsupported block.
 */
public static function blocks_check($data, $blocks)
{
	if (! ReflectionTypeHint::isValid()) return false;

	if (is_array($data))
	{
		foreach ($data as $k => $v)
		{
			if (! self::blocks_check($k, $blocks) || ! self::blocks_check($v, $blocks)) return false;
		}
		return true;
	}

	if (is_string($data))
	{
		$chars = self::str_split($data);
		if ($chars === false) return false; #broken UTF-8
		unset($data); #memory free
		$blocks = (array)$blocks; #loop invariant, normalize once
		$skip = array(); #cache of already checked symbols
		foreach ($chars as $char)
		{
			if (array_key_exists($char, $skip)) continue; #speed improve
			$codepoint = self::ord($char);
			if ($codepoint === false) return false; #broken UTF-8
			$is_valid = false;
			foreach ($blocks as $j => $block)
			{
				if (is_string($block))
				{
					if (! array_key_exists($block, self::$unicode_blocks))
					{
						trigger_error('Unknown block "' . $block . '"!', E_USER_WARNING);
						return false;
					}
					list ($min, $max) = self::$unicode_blocks[$block];
				}
				elseif (is_array($block)) list ($min, $max) = $block;
				elseif (is_int($block)) $min = $max = $block;
				else
				{
					trigger_error('A string/array/int type expected for block[' . $j . ']!', E_USER_ERROR);
					#a custom error handler may return here: never compare against
					#$min/$max left over from a previous block (or undefined)
					return false;
				}
				if ($codepoint >= $min && $codepoint <= $max)
				{
					$is_valid = true;
					break;
				}
			}#foreach
			if (! $is_valid) return false;
			$skip[$char] = null;
		}#foreach
		return true;
	}
	if (is_scalar($data) || is_null($data)) return true; #~ null, integer, float, boolean
	return false; #object or resource
}
/**
 * Recursive worker for autoconvert_request(): converts keys and values of
 * nested request data with _autoconvert_request().
 *
 * Values are passed down by reference, so the source data is modified in place
 * as well; once $is_broken is raised the (possibly partially converted) source
 * data is returned as is.
 *
 * @param   mixed   &$data          array or a single value
 * @param   bool    &$is_converted  raised by _autoconvert_request()
 * @param   bool    &$is_broken     raised by _autoconvert_request()
 * @param   bool    $is_hex2bin
 * @param   string  $charset
 * @return  mixed   rebuilt array / converted value, or $data if broken
 */
private static function _autoconvert_request_recursive(&$data, &$is_converted, &$is_broken, $is_hex2bin, $charset)
{
	#nothing to do any more if a broken value was already met (speed improve)
	if ($is_broken) return $data;

	#leaf
	if (! is_array($data))
	{
		return self::_autoconvert_request($data, $is_converted, $is_broken, $is_hex2bin, $charset);
	}

	#branch: a new array is built because the keys can change too
	$result = array();
	foreach ($data as $key => &$value)
	{
		$key = self::_autoconvert_request($key, $is_converted, $is_broken, $is_hex2bin, $charset);
		if ($is_broken) return $data;

		$result[$key] = self::_autoconvert_request_recursive($value, $is_converted, $is_broken, $is_hex2bin, $charset);
		if ($is_broken) return $data;
	}
	return $result;
}
/**
 * String comparison
 *
 * Uses the intl Collator when the extension is loaded,
 * otherwise falls back to the byte-wise native strcmp().
 *
 * @param   string|null    $s1
 * @param   string|null    $s2
 * @param   string         $locale   For example, 'en_CA', 'ru_RU'
 * @return  int|bool|null  Returns FALSE if error occurred
 *                         Returns < 0 if $s1 is less than $s2;
 *                                 > 0 if $s1 is greater than $s2;
 *                                 0 if they are equal.
 */
public static function strcmp($s1, $s2, $locale = '')
{
	if (! ReflectionTypeHint::isValid()) return false;
	if (is_null($s1) || is_null($s2)) return null;
	if (! function_exists('collator_create')) return strcmp($s1, $s2);
	# PHP 5 >= 5.3.0, PECL intl >= 1.0.0
	# If empty string ("") or "root" are passed, UCA rules will be used.
	# collator_create() is used instead of "new Collator()": the factory returns NULL
	# on failure, whereas an object created with "new" is never falsy, so the check
	# below could never fire.
	$c = collator_create($locale);
	if (! $c)
	{
		trigger_error(intl_get_error_message(), E_USER_WARNING);
		return false;
	}
	return $c->compare($s1, $s2);
}
/**
 * Converts a UTF-8 string to a UNICODE codepoints
 *
 * @param   string|null      $s  UTF-8 string
 * @return  array|bool|null  Unicode codepoints
 *                           Returns FALSE if $s broken (not UTF-8)
 */
public static function to_unicode($s)
{
	if (! ReflectionTypeHint::isValid()) return false;
	if (is_null($s)) return $s;

	#preferred way: recode to fixed-width UCS-4BE and unpack 32-bit big endian integers
	#(since PHP-5.3.x iconv() is a little faster than mb_convert_encoding())
	$has_iconv = function_exists('iconv');
	if ($has_iconv || function_exists('mb_convert_encoding'))
	{
		$ucs4 = $has_iconv
			? @iconv('UTF-8', 'UCS-4BE', $s)
			: @mb_convert_encoding($s, 'UCS-4BE', 'UTF-8');
		if (is_string($ucs4)) return array_values(unpack('N*', $ucs4));
		#any other result except NULL means a failed conversion
		if ($ucs4 !== null) return false;
	}

	#fallback: split into characters and decode them one by one
	$chars = self::str_split($s);
	if ($chars === false) return false;
	$codepoints = array();
	foreach ($chars as $i => $char) $codepoints[$i] = self::ord($char);
	return $codepoints;
}
/**
 * Converts a UTF-8 character to a UNICODE codepoint
 *
 * The codepoint is derived from the byte length of $char only (1 to 4 bytes);
 * the bytes themselves are not validated. Results are memoized per character.
 *
 * @param   string|null    $char  UTF-8 character
 * @return  int|bool|null  Unicode codepoint
 *                         Returns FALSE if $char broken (not UTF-8)
 */
public static function ord($char)
{
	if (! ReflectionTypeHint::isValid()) return false;
	if (is_null($char)) return $char;

	static $cache = array();
	if (array_key_exists($char, $cache)) return $cache[$char]; #speed improve

	#NOTE: square brackets are used for string offsets,
	#the "$char{N}" form is deprecated since PHP 7.4 and is a parse error since PHP 8.0
	switch (strlen($char))
	{
		case 1  : return $cache[$char] = ord($char);
		case 2  : return $cache[$char] = (ord($char[1]) & 63) |
										 ((ord($char[0]) & 31) << 6);
		case 3  : return $cache[$char] = (ord($char[2]) & 63) |
										 ((ord($char[1]) & 63) << 6) |
										 ((ord($char[0]) & 15) << 12);
		case 4  : return $cache[$char] = (ord($char[3]) & 63) |
										 ((ord($char[2]) & 63) << 6) |
										 ((ord($char[1]) & 63) << 12) |
										 ((ord($char[0]) & 7)  << 18);
		default :
			trigger_error('Character 0x' . bin2hex($char) . ' is not UTF-8!', E_USER_WARNING);
			return false;
	}
}
/**
 * Converts the letter case of data in UTF-8 encoding.
 * Arrays are traversed recursively; only the values of the array elements
 * are converted, the keys are left unchanged.
 * To convert keys only, use self::array_change_key_case().
 *
 * @see     self::array_change_key_case()
 * @link    http://www.unicode.org/charts/PDF/U0400.pdf
 * @link    http://ru.wikipedia.org/wiki/ISO_639-1
 * @param   array|scalar|null  $data  Data of arbitrary structure
 * @param   int                $mode  {CASE_LOWER|CASE_UPPER}
 * @param   bool               $is_ascii_optimization  for speed improve
 * @return  scalar|bool|null   Returns FALSE if error occurred
 */
public static function convert_case($data, $mode, $is_ascii_optimization = true)
{
	if (! ReflectionTypeHint::isValid()) return false;

	if (is_array($data))
	{
		#the caller's $is_ascii_optimization is handed down to nested values too
		#(it used to be silently reset to the default on recursion)
		foreach ($data as $k => $v) $data[$k] = self::convert_case($v, $mode, $is_ascii_optimization);
		return $data;
	}
	#non-strings and empty strings (including "0") are returned untouched
	if (! is_string($data) || ! $data) return $data;

	if ($mode === CASE_UPPER)
	{
		if ($is_ascii_optimization && self::is_ascii($data)) return strtoupper($data); #speed improve!
		#since PHP-5.3.x strtr() is 2-3 times faster than mb_strtoupper()
		return strtr($data, array_flip(self::$convert_case_table));
	}
	if ($mode === CASE_LOWER)
	{
		if ($is_ascii_optimization && self::is_ascii($data)) return strtolower($data); #speed improve!
		#since PHP-5.3.x strtr() is 2-3 times faster than mb_strtolower()
		return strtr($data, self::$convert_case_table);
	}
	trigger_error('Parameter 2 should be a constant of CASE_LOWER or CASE_UPPER!', E_USER_WARNING);
	return $data;
}
/**
 * Convert all HTML entities to native UTF-8 characters
 * Decodes named entities through self::$html_entity_table.
 * All dec and hex numeric entities are converted to UTF-8 as well.
 *
 * Example: '&quot;' or '&#34;' or '&#x22;' will be converted to '"'.
 *
 * @link http://www.htmlhelp.com/reference/html40/entities/
 * @link http://www.alanwood.net/demos/ent4_frame.html (HTML 4.01 Character Entity References)
 * @link http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset1.asp?frame=true
 * @link http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset2.asp?frame=true
 * @link http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset3.asp?frame=true
 *
 * @param   scalar|null  $s
 * @param   bool         $is_special_chars   Process the special html entities additionally? (< > & ")
 * @return  scalar|null  Returns FALSE if error occurred
 */
public static function html_entity_decode($s, $is_special_chars = false)
{
	if (! ReflectionTypeHint::isValid()) return false;
	if (! is_string($s)) return $s;

	#speed improve: the shortest entity takes 4 bytes (&#d; &xx;)
	#and needs an "&" followed somewhere by a ";"
	#NOTE: the assignment is parenthesized, otherwise $pos gets the boolean result of the comparison
	if (strlen($s) < 4
		|| ($pos = strpos($s, '&')) === false
		|| strpos($s, ';', $pos) === false) return $s;

	$table = self::$html_entity_table;
	if ($is_special_chars) $table += self::$html_special_chars_table;

	#replace named entities (since PHP-5.3.x strtr() is the fastest way)
	$s = strtr($s, $table);

	#replace numeric dec and hex entities:
	if (strpos($s, '&#') !== false) #speed improve
	{
		$class = __CLASS__;
		$html_special_chars_table_flipped = array_flip(self::$html_special_chars_table);
		$s = preg_replace_callback('/&#((x)[\da-fA-F]{1,6}+|\d{1,7}+);/sSX',
			function (array $m) use ($class, $html_special_chars_table_flipped, $is_special_chars)
			{
				#$m[1] of a hex entity still carries the leading "x", cut it off before hexdec()
				$codepoint = (isset($m[2]) && $m[2] === 'x') ? hexdec(substr($m[1], 1)) : intval($m[1]);
				#special chars may only live in the ASCII range; without this guard
				#a bigger codepoint was wrapped modulo 256 (e.g. &#294; turned into &amp;)
				if (! $is_special_chars && $codepoint < 0x80)
				{
					$char = chr($codepoint);
					#keep a special char encoded, but as its named entity
					if (array_key_exists($char, $html_special_chars_table_flipped)) return $html_special_chars_table_flipped[$char];
				}
				return $class::chr($codepoint);
			}, $s);
	}
	return $s;
}
/**
 * Convert special UTF-8 characters to HTML entities.
 * Characters are replaced through the flipped self::$html_entity_table.
 *
 * @link http://www.htmlhelp.com/reference/html40/entities/
 * @link http://www.alanwood.net/demos/ent4_frame.html (HTML 4.01 Character Entity References)
 * @link http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset1.asp?frame=true
 * @link http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset2.asp?frame=true
 * @link http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset3.asp?frame=true
 *
 * @param   scalar|null  $s
 * @param   bool         $is_special_chars_only   Process the special html entities only? (< > & ")
 * @return  scalar|null  Returns FALSE if error occurred
 */
public static function html_entity_encode($s, $is_special_chars_only = false)
{
	if (! ReflectionTypeHint::isValid()) return false;
	if (! is_string($s)) return $s;

	if (! $is_special_chars_only)
	{
		#replace UTF-8 chars to named entities (since PHP-5.3.x strtr() is the fastest way)
		$char_to_entity = array_flip(self::$html_entity_table);
		return strtr($s, $char_to_entity);
	}

	#special chars only: delegated to the native function with its default flags
	return htmlspecialchars($s);
}
/**
 * Call preg_match_all() and convert byte offsets into character offsets for PREG_OFFSET_CAPTURE flag.
 * This is regardless of whether you use /u modifier.
 *
 * Entries of subpatterns that did not participate in a match are left as
 * preg_match_all() reported them (offset -1).
 *
 * @link  http://bolknote.ru/2010/09/08/~2704
 *
 * @param   string           $pattern
 * @param   string|null      $subject
 * @param   array            $matches
 * @param   int              $flags
 * @param   int              $char_offset
 * @return  array|bool|null  Returns FALSE if error occurred
 */
public static function preg_match_all($pattern, $subject, &$matches, $flags = PREG_PATTERN_ORDER, $char_offset = 0)
{
	if (! ReflectionTypeHint::isValid()) return false;
	if (is_null($subject)) return null;

	#the native function expects the start offset in bytes
	$byte_offset = ($char_offset > 0) ? strlen(self::substr($subject, 0, $char_offset)) : $char_offset;

	$return = preg_match_all($pattern, $subject, $matches, $flags, $byte_offset);
	if ($return === false) return false;

	if ($flags & PREG_OFFSET_CAPTURE)
	{
		foreach ($matches as &$match)
		{
			foreach ($match as &$a)
			{
				#an unmatched subpattern is reported as offset -1 (or as a bare string
				#by old PHP versions): substr($subject, 0, -1) would turn it into a bogus position
				if (! is_array($a) || $a[1] < 0) continue;
				$a[1] = self::strlen(substr($subject, 0, $a[1]));
			}
			unset($a);
		}
		unset($match);
	}

	return $return;
}
$maxlength) $maxlength = 256; + + #speed improve block + #{{{ + if (strlen($s) <= $maxlength) return $s; + $s2 = str_replace("\r\n", '?', $s); + $s2 = preg_replace('/&(?> [a-zA-Z][a-zA-Z\d]+ + | \#(?> \d{1,4} + | x[\da-fA-F]{2,4} + ) + ); # html сущности (< > & ") + /sxSX', '?', $s2); + if (strlen($s2) <= $maxlength || self::strlen($s2) <= $maxlength) return $s; + #}}} + + $r = preg_match_all('/(?> \r\n # переносы строк + | &(?> [a-zA-Z][a-zA-Z\d]+ + | \#(?> \d{1,4} + | x[\da-fA-F]{2,4} + ) + ); # html сущности (< > & ") + | . + ) + /sxuSX', $s, $m); + if ($r === false) return false; + + #d($m); + if (count($m[0]) <= $maxlength) return $s; + + $left = implode('', array_slice($m[0], 0, $maxlength)); + #из диапазона ASCII исключаем буквы, цифры, открывающие парные символы [a-zA-Z\d\(\{\[] и некоторые др. символы + #нельзя вырезать в конце строки символ ";", т.к. он используются в сущностях &xxx; + $left2 = rtrim($left, "\x00..\x28\x2A..\x2F\x3A\x3C..\x3E\x40\x5B\x5C\x5E..\x60\x7B\x7C\x7E\x7F"); + if (strlen($left) !== strlen($left2)) $return = $left2 . $continue; + else + { + #добавляем остаток к обрезанному слову + $right = implode('', array_slice($m[0], $maxlength)); + preg_match('/^(?> [\d\)\]\}\-\.:]+ #цифры, закрывающие парные символы, дефис для составных слов, дата, время, IP-адреса, URL типа www.ya.ru:80! + | \p{L}+ #буквы + | \xe2\x80\x9d #закрывающие кавычки + | \xe2\x80\x99 #закрывающие кавычки + | \xe2\x80\x9c #закрывающие кавычки + | \xc2\xbb #закрывающие кавычки + )+ + /suxSX', $right, $m); + #d($m); + $right = isset($m[0]) ? rtrim($m[0], '.-') : ''; + $return = $left . $right; + if (strlen($return) !== strlen($s)) $return .= $continue; + } + if (self::strlen($s) - self::strlen($return) < $tail_min_length) return $s; + + $is_cutted = true; + return $return; + } + + /** + * Implementation str_split() function for UTF-8 encoding string. 
/**
 * Implementation str_split() function for UTF-8 encoding string.
 *
 * @param   string|null      $s
 * @param   int|null|digit   $length
 * @return  array|bool|null  Returns FALSE if error occurred
 */
public static function str_split($s, $length = null)
{
	if (! ReflectionTypeHint::isValid()) return false;
	if (is_null($s)) return $s;

	$length = ($length === null) ? 1 : intval($length);
	if ($length < 1) return false;

	#the string is always cut into single characters first,
	#because there are limits in regexp for {min,max}!
	$is_matched = preg_match_all('~.~suSX', $s, $m);
	if ($is_matched === false) return false;
	if (function_exists('preg_last_error') && preg_last_error() !== PREG_NO_ERROR) return false;

	$chars = $m[0];
	if ($length === 1) return $chars;

	#glue every $length characters together
	$pieces = array();
	foreach (array_chunk($chars, $length) as $chunk) $pieces[] = implode('', $chunk);
	return $pieces;
}
+ It's much faster than iconv_strlen() + Note: this function does not count bad UTF-8 bytes in the string - these are simply ignored + */ + return strlen(utf8_decode($s)); + + /* + #slowly then strlen(utf8_decode()) + if (function_exists('iconv_strlen')) return iconv_strlen($s, 'utf-8'); + + #Do not count UTF-8 continuation bytes + #return strlen(preg_replace('/[\x80-\xBF]/sSX', '', $s)); + + #slowly then strlen(utf8_decode()) + preg_match_all('~.~suSX', $str, $m); + return count($m[0]); + + #slowly then preg_match_all() + count() + $n = 0; + for ($i = 0, $len = strlen($s); $i < $len; $i++) + { + $c = ord(substr($s, $i, 1)); + if ($c < 0x80) $n++; #single-byte (0xxxxxx) + elseif (($c & 0xC0) == 0xC0) $n++; #multi-byte starting byte (11xxxxxx) + } + return $n; + */ + } + + /** + * Implementation strpos() function for UTF-8 encoding string + * + * @param string|null $s The entire string + * @param string|int $needle The searched substring + * @param int|null $offset The optional offset parameter specifies the position from which the search should be performed + * @return int|bool|null Returns the numeric position of the first occurrence of needle in haystack. + * If needle is not found, will return FALSE. + */ + public static function strpos($s, $needle, $offset = null) + { + if (! ReflectionTypeHint::isValid()) return false; + if (is_null($s)) return $s; + + if ($offset === null || $offset < 0) $offset = 0; + if (function_exists('mb_strpos')) return mb_strpos($s, $needle, $offset, 'utf-8'); + #iconv_strpos() deprecated, because slowly than self::strlen(substr()) + #if (function_exists('iconv_strpos')) return iconv_strpos($s, $needle, $offset, 'utf-8'); + $byte_pos = $offset; + do if (($byte_pos = strpos($s, $needle, $byte_pos)) === false) return false; + while (($char_pos = self::strlen(substr($s, 0, $byte_pos++))) < $offset); + return $char_pos; + } + + /** + * Find position of first occurrence of a case-insensitive string. 
+ * + * @param string|null $s The entire string + * @param string|int $needle The searched substring + * @param int|null $offset The optional offset parameter specifies the position from which the search should be performed + * @return int|bool|null Returns the numeric position of the first occurrence of needle in haystack. + * If needle is not found, will return FALSE. + */ + public static function stripos($s, $needle, $offset = null) + { + if (! ReflectionTypeHint::isValid()) return false; + if (is_null($s)) return $s; + + if ($offset === null || $offset < 0) $offset = 0; + if (function_exists('mb_stripos')) return mb_stripos($s, $needle, $offset, 'utf-8'); + + #optimization block (speed improve) + #{{{ + $ascii_int = intval(self::is_ascii($s)) + intval(self::is_ascii($needle)); + if ($ascii_int === 1) return false; + if ($ascii_int === 2) return stripos($s, $needle, $offset); + #}}} + + $s = self::convert_case($s, CASE_LOWER, false); + if ($s === false) return false; + $needle = self::convert_case($needle, CASE_LOWER, false); + if ($needle === false) return false; + return self::strpos($s, $needle, $offset); + } + + /** + * Implementation strrev() function for UTF-8 encoding string + * + * @param string|null $s + * @return string|bool|null Returns FALSE if error occurred + */ + public static function strrev($s) + { + if (! ReflectionTypeHint::isValid()) return false; + if (is_null($s)) return $s; + + if (0) #TODO test speed + { + $s = self::_convert($s, 'UTF-8', 'UTF-32'); + if (! is_string($s)) return false; + $s = implode('', array_reverse(str_split($s, 4))); + return self::_convert($s, 'UTF-32', 'UTF-8'); + } + + if (! is_array($a = self::str_split($s))) return false; + return implode('', array_reverse($a)); + } + + /** + * Implementation substr() function for UTF-8 encoding string. 
+ * + * @link http://www.w3.org/International/questions/qa-forms-utf-8.html + * @param string|null $s + * @param int|digit $offset + * @param int|null|digit $length + * @return string|bool|null Returns FALSE if error occurred + */ + public static function substr($s, $offset, $length = null) + { + if (! ReflectionTypeHint::isValid()) return false; + if (is_null($s)) return $s; + + #since PHP-5.3.x mb_substr() faster then iconv_substr() + if (function_exists('mb_substr')) + { + if ($length === null) $length = self::strlen($s); + return mb_substr($s, $offset, $length, 'utf-8'); + } + if (function_exists('iconv_substr')) + { + if ($length === null) $length = self::strlen($s); + return iconv_substr($s, $offset, $length, 'utf-8'); + } + + static $_s = null; + static $_a = null; + + if ($_s !== $s) $_a = self::str_split($_s = $s); + if (! is_array($_a)) return false; + if ($length !== null) $a = array_slice($_a, $offset, $length); + else $a = array_slice($_a, $offset); + return implode('', $a); + } + + /** + * Implementation substr_replace() function for UTF-8 encoding string. + * + * @param string|null $s + * @param string|int $replacement + * @param int|digit $start + * @param int|null $length + * @return string|bool|null Returns FALSE if error occurred + */ + public static function substr_replace($s, $replacement, $start, $length = null) + { + if (! ReflectionTypeHint::isValid()) return false; + if (is_null($s)) return $s; + + if (! is_array($a = self::str_split($s))) return false; + array_splice($a, $start, $length, $replacement); + return implode('', $a); + } + + /** + * Implementation ucfirst() function for UTF-8 encoding string. + * Преобразует первый символ строки в кодировке UTF-8 в верхний регистр. + * + * @param string|null $s + * @param bool $is_other_to_lowercase остальные символы преобразуются в нижний регистр? + * @return string|bool|null Returns FALSE if error occurred + */ + public static function ucfirst($s, $is_other_to_lowercase = true) + { + if (! 
ReflectionTypeHint::isValid()) return false; + if (is_null($s)) return $s; + + if ($s === '' || ! is_string($s)) return $s; + if (! preg_match('/^(.)(.*)$/suSX', $s, $m)) return false; + return self::uppercase($m[1]) . ($is_other_to_lowercase ? self::lowercase($m[2]) : $m[2]); + } + + /** + * Implementation ucwords() function for UTF-8 encoding string. + * Преобразует в верхний регистр первый символ каждого слова в строке в кодировке UTF-8, + * остальные символы каждого слова преобразуются в нижний регистр. + * + * @param string|null $s + * @param bool $is_other_to_lowercase остальные символы преобразуются в нижний регистр? + * @param string $spaces_re + * @return string|bool|null Returns FALSE if error occurred + */ + public static function ucwords($s, $is_other_to_lowercase = true, $spaces_re = '~([\pZ\s]+)~suSX') #\pXps is POSIX space: property Z or tab, NL, VT, FF, CR + { + if (! ReflectionTypeHint::isValid()) return false; + if (is_null($s)) return $s; + + $words = preg_split($spaces_re, $s, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE); + foreach ($words as $k => $word) + { + $words[$k] = self::ucfirst($word, $is_other_to_lowercase = true); + if ($words[$k] === false) return false; + } + return implode('', $words); + } + + /** + * Decodes a string in the format %uXXXX or %u{XXXXXX} in the UTF-8 string. + * + * Используется для декодирования данных типа "%u0442%u0435%u0441%u0442", + * закодированных устаревшей функцией javascript://encode(). + * Рекомендуется использовать функцию javascript://encodeURIComponent(). + * + * NOTICE + * Устаревший формат %uXXXX позволяет использовать юникод только из диапазона UCS-2, т.е. от U+0 до U+FFFF + * + * @param scalar|array|null $data + * @param bool $is_rawurlencode + * @return scalar|array|null Returns FALSE if error occurred + */ + public static function unescape($data, $is_rawurlencode = false) + { + if (! 
ReflectionTypeHint::isValid()) return false; + if (is_array($data)) + { + $d = array(); + foreach ($data as $k => &$v) + { + $k = self::unescape($k, $is_rawurlencode); + if ($k === false) return false; + $d[$k] = self::unescape($v, $is_rawurlencode); + if ($d[$k] === false && ! is_bool($v)) return false; + } + return $d; + } + if (is_string($data)) + { + if (strpos($data, '%u') === false) return $data; #use strpos() for speed improving + return preg_replace_callback('/%u( [\da-fA-F]{4}+ #%uXXXX only UCS-2 + | \{ [\da-fA-F]{1,6}+ \} #%u{XXXXXX} extended form for all UNICODE charts + ) + /sxSX', + function (array $m) use ($is_rawurlencode) + { + $codepoint = hexdec(trim($m[1], '{}')); + $char = self::chr($codepoint); + return $is_rawurlencode ? rawurlencode($char) : $char; + }, + $data); + } + if (is_scalar($data) || is_null($data)) return $data; #~ null, integer, float, boolean + return false; #object or resource + } + + /** + * 1) Corrects the global arrays $_GET, $_POST, $_COOKIE, $_REQUEST + * decoded values ​​in the format %uXXXX and %u{XXXXXX}, encoded, + * for example, through an outdated javascript function escape(). + * Standard PHP5 cannot do it. + * 2) If in the HTTP_COOKIE there are parameters with the same name, + * takes the last value, not the first, as in the QUERY_STRING. + * 3) Creates an array of $_POST for non-standard Content-Type, for example, "Content-Type: application/octet-stream". + * Standard PHP5 creates an array for "Content-Type: application/x-www-form-urlencoded" and "Content-Type: multipart/form-data". + * + * Сессии, куки и независимая авторизация на поддоменах. + * + * ПРИМЕР 1 + * У рабочего сайта http://domain.com появились поддомены. + * Для кроссдоменной авторизации через механизм сессий имя хоста для COOKIE было изменено с "domain.com" на ".domain.com" + * В результате авторизация не работает. + * Помогает очистка COOKIE, но их принудительная очистка на тысячах пользовательских компьютеров проблематична. 
+ * Проблема в следующем: если в HTTP_COOKIE есть параметры с одинаковым именем, то берётся последнее значение, + * а не первое, как в QUERY_STRING. + * Более подробное описание: + * PHP не правильно (?) обрабатывает заголовок HTTP_COOKIE, если там встречаются параметры с одинаковым именем, но разными значениями. + * Пример запроса HTTP-заголовка клиентом: "Cookie: sid=chpgs2fiak-330mzqza; sid=cmz5tnp5zz-xlbbgqp" + * В этом случае сервер берёт первое значение, а не последнее. + * Хотя если в QUERY_STRING есть такая ситуация, всегда берётся последний параметр. + * В HTTP_COOKIE два параметра с одинаковым именем могут появиться, если отправить клиенту следующие HTTP-заголовки: + * "Set-Cookie: sid=chpgs2fiak-330mzqza; expires=Thu, 15 Oct 2009 14:23:42 GMT; path=/; domain=domain.com" (только domain.com) + * "Set-Cookie: sid=cmz6uqorzv-1bn35110; expires=Thu, 15 Oct 2009 14:23:42 GMT; path=/; domain=.domain.com" (domain.com и все его поддомены) + * Решение: поменять имя сессии. + * + * ПРИМЕР 2 + * Есть рабочие сайты: http://domain.com (основной), http://admin.domain.com (админка), + * http://sub1.domain.com (подпроект 1), http://sub2.domain.com, (подпроект 2). + * Так же имеется сервер разработки http://dev.domain.com, на котором м. б. свои поддомены. + * Требуется сделать независимую кросс-доменную авторизацию для http://*.domain.com и http://*.dev.domain.com. + * Для сохранения статуса авторизации будем использовать сессию, имя и значение которой пишется в COOKIE. + * Т. к. домены http://*.dev.domain.com имеют пересечение с доменами http://*.domain.com, + * для независимой авторизации нужно использовать разные имена сессий. 
+ * Пример HTTP заголовков ответа сервера: + * "Set-Cookie: sid=chpgs2fiak-330mzqza; expires=Thu, 15 Oct 2009 14:23:42 GMT; path=/; domain=.domain.com" (.domain.com и все его поддомены) + * "Set-Cookie: sid.dev=cmz6uqorzv-1bn35110; expires=Thu, 15 Oct 2009 14:23:42 GMT; path=/; domain=.dev.domain.com" (dev.domain.com и все его поддомены) + * + * @link http://tools.ietf.org/html/rfc2965 RFC 2965 - HTTP State Management Mechanism + * @return void + */ + public static function unescape_request() + { + $fixed = false; + #ATTENTION! HTTP_RAW_POST_DATA is only accessible when Content-Type of POST request is NOT default "application/x-www-form-urlencoded"! + $HTTP_RAW_POST_DATA = isset($_SERVER['REQUEST_METHOD']) && $_SERVER['REQUEST_METHOD'] === 'POST' ? (isset($GLOBALS['HTTP_RAW_POST_DATA']) ? $GLOBALS['HTTP_RAW_POST_DATA'] : @file_get_contents('php://input')) : null; + if (ini_get('always_populate_raw_post_data')) $GLOBALS['HTTP_RAW_POST_DATA'] = $HTTP_RAW_POST_DATA; + foreach (array( '_GET' => isset($_SERVER['QUERY_STRING']) ? $_SERVER['QUERY_STRING'] : null, + '_POST' => $HTTP_RAW_POST_DATA, + '_COOKIE' => isset($_SERVER['HTTP_COOKIE']) ? $_SERVER['HTTP_COOKIE'] : null, + ) as $k => $v) + { + if (! is_string($v)) continue; + if ($k === '_COOKIE') + { + $v = preg_replace('/; *+/sSX', '&', $v); + unset($_COOKIE); #будем парсить HTTP_COOKIE сами, чтобы сделать обработку как у QUERY_STRING + } + if (strpos($v, '%u') !== false) + { + parse_str(self::unescape($v, $is_rawurlencode = true), $GLOBALS[$k]); + $fixed = true; + continue; + } + if (array_key_exists($k, $GLOBALS)) continue; + parse_str($v, $GLOBALS[$k]); + $fixed = true; + } + if ($fixed) + { + $_REQUEST = + (isset($_COOKIE) ? $_COOKIE : array()) + + (isset($_POST) ? $_POST : array()) + + (isset($_GET) ? $_GET : array()); + } + } + + /** + * Calculates the height of the edit text in