From 02bb08dcd23f57b22753c6a7576d0f4039e71b4e Mon Sep 17 00:00:00 2001 From: Yuriy Pikhtarev Date: Sun, 21 May 2017 15:33:29 +0300 Subject: [PATCH] UTF-8 autocorrection removal from standard package. --- library/config.php | 1 - library/includes/classes/correct.php | 3187 ---------------- library/includes/classes/reflection.php | 225 -- library/includes/classes/utf8.php | 4513 ----------------------- search.php | 11 +- tracker.php | 11 +- 6 files changed, 2 insertions(+), 7946 deletions(-) delete mode 100644 library/includes/classes/correct.php delete mode 100644 library/includes/classes/reflection.php delete mode 100644 library/includes/classes/utf8.php diff --git a/library/config.php b/library/config.php index 6a4d69089..d09595c8f 100644 --- a/library/config.php +++ b/library/config.php @@ -506,7 +506,6 @@ $bb_cfg['search_min_word_len'] = 3; $bb_cfg['search_max_word_len'] = 35; $bb_cfg['limit_max_search_results'] = false; $bb_cfg['spam_filter_file_path'] = ''; // BB_PATH .'/misc/spam_filter_words.txt'; -$bb_cfg['autocorrect_wkl'] = true; // autocorrect wrong keyboard layout // Posting $bb_cfg['prevent_multiposting'] = true; // replace "reply" with "edit last msg" if user (not admin or mod) is last topic poster diff --git a/library/includes/classes/correct.php b/library/includes/classes/correct.php deleted file mode 100644 index 4a22f710a..000000000 --- a/library/includes/classes/correct.php +++ /dev/null @@ -1,3187 +0,0 @@ - 'cosmo' (2 первых и последняя буква — ошибочные) - * "\x78\x70\x65н" => 'хрен' (первые 3 буквы — ошибочные) - * "вебvfcnth" => 'вебмастер' - * "webьфыеук" => 'webmaster' - * "цццюмуыеш.ru" => 'www.vesti.ru' - * "\x54.\x43.\x48\x61вка" => 'Т.С.Навка' - * - * Hints - * Типичный пример алгоритма работы для поля ввода с автодополнением: - * 1. Сделать выборку по исходному запросу; - * 2. Если есть результат, возвратить его и исходный запрос; - * 3. Иначе скорректировать исходный запрос через Text_LangCorrect; - * 4. 
Если исходный и скорректированный запрос совпадает, возвратить пустой результат и исходный запрос; - * 5. Иначе сделать выборку по скорректированному запросу; - * 6. Возвратить результат. Если результат не пустой, возвратить скорректированный запрос, иначе исходный. - * - * License - * Только для некоммерческого использования! - * - * @link http://code.google.com/p/php-lang-correct/ - * @license http://creativecommons.org/licenses/by-nc-sa/3.0/ - * @author Nasibullin Rinat - * @version 1.4.3 - */ -class Text_LangCorrect -{ - /** - * Флаг для исправления ошибочно набранных букв в словах, - * которые выглядят одинаково в разных раскладках клавиатуры. - * Алгоритм работает достаточно надёжно и быстро. - */ - const SIMILAR_CHARS = 1; - - /** - * Флаг для исправления ошибочно набранных слов в другой раскладке клавиатуры. - * Алгоритм может иногда ошибаться, работает в разы медленнее, чем SIMILAR_CHARS. - */ - const KEYBOARD_LAYOUT = 2; - - /** - * Флаг для добавления исправлений, если влючён флаг KEYBOARD_LAYOUT - * Синтаксис и пример: "(,.cn=>бюст)" - * ^ ^^ ^ - */ - const ADD_FIX = 4; - - #английский (all) - private $en = '[a-zA-Z]'; - - #английский (uppercase) - private $en_uc = '[A-Z]'; - - #английский + символы, которые м.б. набраны по ошибке в английской раскладке клавиатуры вместо русских букв (all) - private $en_sc = '[a-zA-Z\'`~<>,.:;{}\[\]"]'; - - #символы, которые м.б. 
набраны по ошибке в английской раскладке клавиатуры вместо русских букв - private $sc = '[\'`~<>,.:;{}\[\]"]'; - private $no_sc = '[^\'`~<>,.:;{}\[\]"]'; - - #русский + татарский (all) - private $tt = '[\xd0-\xd3][\x80-\xbf] - (?<=\xd0[\x90-\xbf\x81]|\xd1[\x80-\x8f\x91]|\xd2[\x96\x97\xa2\xa3\xae\xaf\xba\xbb]|\xd3[\x98\x99\xa8\xa9])'; - - #русский + татарский (uppercase) - private $tt_uc = '[\xd0\xd2\xd3][\x81-\xba] - (?<=\xd0[\x90-\xaf\x81]|\xd2[\x96\xa2\xae\xba]|\xd3[\x98\xa8])'; - - #русский + татарский (для фильтрованных текстов) (all) - private $tt_f = '[\xd0-\xd3][\x80-\xbf] - #комментируем для увеличения скорости, т.к. остальные символы отфильтрованы - #(?<=\xd0[\x90-\xbf\x81]|\xd1[\x80-\x8f\x91]|\xd2[\x96\x97\xa2\xa3\xae\xaf\xba\xbb]|\xd3[\x98\x99\xa8\xa9]) - '; - - #гласная (vowel) (lowercase) - private $vowel_lc = array( - 'tt' => '\xd0[\xb0\xb5\xb8\xbe]|\xd1[\x83\x8b\x8d\x8e\x8f\x91] #аеиоуыэюяё (гласные, 10 шт.) - #| \xd0[\x90\x95\x98\x9e\xa3\xab\xad\xae\xaf\x81] #АЕИОУЫЭЮЯЁ (гласные, 10 шт.) - ', - 'en' => '[aeiouy]', #латинских 6 шт. - ); - - #согласная (consonant) + графические знаки для русского языка (ъ, ь) (lowercase) - private $consonant_lc = array( - 'tt' => '\xd0[\xb1-\xb4\xb6\xb7\xb9\xba-\xbd\xbf]|\xd1[\x80\x81\x82\x84-\x89\x8a\x8c] #бвгджзйклмнпрстфхцчшщ ъь (согласные, 21+2 шт.) - #| \xd0[\x91-\x94\x96\x97\x99\x9a-\x9d\x9f-\xa2\xa4-\xa9\xaa\xac] #БВГДЖЗЙКЛМНПРСТФХЦЧШЩ ЪЬ (согласные, 21+2 шт.) - ', - 'en' => '[bcdfghjklmnpqrstvwxz]', #латинских 20 шт. - ); - - private $words_exceptions = array( - 'tt' => array( - 'трлн' => null, - 'ющенко' => null, - 'мебельград' => null, - 'дэнис' => null, - ), - 'en' => array( - 'heuer' => null, - ), - ); - - #русские буквы, похожие на англ. (uppercase) - private $ru_similar_uc = "\xd0[\x90\x92\x95\x9a\x9c\x9d\x9e\xa0-\xa3\xa5]"; - - #русские буквы, похожие на англ. (all) - private $ru_similar = "\xd0[\x90\x92\x95\x9a\x9c\x9d\x9e\xa0-\xa3\xa5\xb0\xb5\xbe]|\xd1[\x80\x81\x83\x85]"; - - #англ. 
буквы, похожие на русские (uppercase) - private $en_similar_uc = '[ABEKMHOPCTYX]'; - - /* - #$tt_fake = '\xd0[\xb0\xb5\xbe\x90\x92\x95\x9a\x9c\x9d\x9e\xa0\xa1\xa2\xa3\xa5]|\xd1[\x80\x81\x83\x85]'; - $tt_fake = '[\xd0\xd1][\x80-\xbe] - (?<=\xd0[\xb0\xb5\xbe\x90\x92\x95\x9a\x9c\x9d\x9e\xa0\xa1\xa2\xa3\xa5]|\xd1[\x80\x81\x83\x85])'; - $en_fake = '[aeopcyxABEKMHOPCTYX]'; - */ - - #уникальные русские буквы - /* - CASE_UPPER, case_lower - "\xd0\x81", "\xd1\x91", #Ё ё - "\xd0\x91", "\xd0\xb1", #Б б - "\xd0\x92", "\xd0\xb2", #В в - "\xd0\x93", "\xd0\xb3", #Г г - "\xd0\x94", "\xd0\xb4", #Д д - "\xd0\x96", "\xd0\xb6", #Ж ж - "\xd0\x97", "\xd0\xb7", #З з - "\xd0\x98", "\xd0\xb8", #И и - "\xd0\x99", "\xd0\xb9", #Й й - "\xd0\xba", #К к - "\xd0\x9b", "\xd0\xbb", #Л л - "\xd0\xbd", #Н н - "\xd0\x9f", "\xd0\xbf", #П п - "\xd1\x82", #Т т - "\xd0\xa4", "\xd1\x84", #Ф ф - "\xd0\xa6", "\xd1\x86", #Ц ц - "\xd0\xa7", "\xd1\x87", #Ч ч - "\xd0\xa8", "\xd1\x88", #Ш ш - "\xd0\xa9", "\xd1\x89", #Щ щ - "\xd0\xaa", "\xd1\x8a", #Ъ ъ - "\xd0\xab", "\xd1\x8b", #Ы ы - "\xd0\xac", "\xd1\x8c", #Ь ь - "\xd0\xad", "\xd1\x8d", #Э э - "\xd0\xae", "\xd1\x8e", #Ю ю - "\xd0\xaf", "\xd1\x8f", #Я я - */ - #$tt_uniq = "\xd0[\xb1-\xb4\xb6-\xbb\xbd\xbf\x81\x91-\x94\x96-\x99\x9b\x9f\xa4\xa6-\xaf]|\xd1[\x82\x84\x86-\x8f\x91]"; - private $tt_uniq = "[\xd0\xd1][\x82-\xbf] - (?<=\xd0[\xb1-\xb4\xb6-\xbb\xbd\xbf\x81\x91-\x94\x96-\x99\x9b\x9f\xa4\xa6-\xaf]|\xd1[\x82\x84\x86-\x8f\x91])"; - - #уникальные латинские буквы - /* - CASE_UPPER, case_lower - "\x42", "\x62", #B b - "\x44", "\x64", #D d - "\x46", "\x66", #F f - "\x68", #H h - "\x49", "\x69", #I i - "\x4a", "\x6a", #J j - "\x6b", #K k - "\x4c", "\x6c", #L l - "\x6d", #M m - "\x4e", "\x6e", #N n - "\x51", "\x71", #Q q - "\x52", "\x72", #R r - "\x53", "\x73", #S s - "\x74", #T t - "\x55", "\x75", #U u - "\x56", "\x76", #V v - "\x57", "\x77", #W w - "\x5a", "\x7a", #Z z - */ - private $en_uniq = 
"[\x42\x44\x46\x49\x4a\x4c\x4e\x51\x52\x53\x55\x57\x56\x5a\x62\x64\x66\x68\x69\x6a-\x6e\x71-\x77\x7a]"; - - private $table_flip; #array - private $words; #corrected words - private $en_correct; #string - private $tt_correct; #string - private $mode; #bool - - private $is_flip = false; - private $method = 0; - - private $table = array( - #метод 0: таблица исправления ошибочно набранных букв, которые выглядят одинаково (русский <--> английский) - 0 => array( - #lowercase #UPPERCASE - "\xd0\xb0" => 'a', "\xd0\x90" => 'A', - "\xd0\x92" => 'B', - "\xd0\xb5" => 'e', "\xd0\x95" => 'E', - "\xd0\x9a" => 'K', - "\xd0\x9c" => 'M', - "\xd0\x9d" => 'H', - "\xd0\xbe" => 'o', "\xd0\x9e" => 'O', - "\xd1\x80" => 'p', "\xd0\xa0" => 'P', - "\xd1\x81" => 'c', "\xd0\xa1" => 'C', - "\xd0\xa2" => 'T', - "\xd1\x83" => 'y', "\xd0\xa3" => 'Y', - "\xd1\x85" => 'x', "\xd0\xa5" => 'X', - ), - #метод 1: таблица исправления ошибочно набранных букв в другой раскладке клавиатуры (русский <--> английский) - 1 => array( - #CASE_UPPER #case_lower - "\xd0\x81" => '~', "\xd1\x91" => '`', #Ё ё - "\xd0\x90" => 'F', "\xd0\xb0" => 'f', #А а - "\xd0\x91" => '<', "\xd0\xb1" => ',', #Б б - "\xd0\x92" => 'D', "\xd0\xb2" => 'd', #В в - "\xd0\x93" => 'U', "\xd0\xb3" => 'u', #Г г - "\xd0\x94" => 'L', "\xd0\xb4" => 'l', #Д д - "\xd0\x95" => 'T', "\xd0\xb5" => 't', #Е е - "\xd0\x96" => ':', "\xd0\xb6" => ';', #Ж ж - "\xd0\x97" => 'P', "\xd0\xb7" => 'p', #З з - "\xd0\x98" => 'B', "\xd0\xb8" => 'b', #И и - "\xd0\x99" => 'Q', "\xd0\xb9" => 'q', #Й й - "\xd0\x9a" => 'R', "\xd0\xba" => 'r', #К к - "\xd0\x9b" => 'K', "\xd0\xbb" => 'k', #Л л - "\xd0\x9c" => 'V', "\xd0\xbc" => 'v', #М м - "\xd0\x9d" => 'Y', "\xd0\xbd" => 'y', #Н н - "\xd0\x9e" => 'J', "\xd0\xbe" => 'j', #О о - "\xd0\x9f" => 'G', "\xd0\xbf" => 'g', #П п - #CASE_UPPER #case_lower - "\xd0\xa0" => 'H', "\xd1\x80" => 'h', #Р р - "\xd0\xa1" => 'C', "\xd1\x81" => 'c', #С с - "\xd0\xa2" => 'N', "\xd1\x82" => 'n', #Т т - "\xd0\xa3" => 'E', "\xd1\x83" => 'e', #У у - 
"\xd0\xa4" => 'A', "\xd1\x84" => 'a', #Ф ф - "\xd0\xa5" => '{', "\xd1\x85" => '[', #Х х - "\xd0\xa6" => 'W', "\xd1\x86" => 'w', #Ц ц - "\xd0\xa7" => 'X', "\xd1\x87" => 'x', #Ч ч - "\xd0\xa8" => 'I', "\xd1\x88" => 'i', #Ш ш - "\xd0\xa9" => 'O', "\xd1\x89" => 'o', #Щ щ - "\xd0\xaa" => '}', "\xd1\x8a" => ']', #Ъ ъ - "\xd0\xab" => 'S', "\xd1\x8b" => 's', #Ы ы - "\xd0\xac" => 'M', "\xd1\x8c" => 'm', #Ь ь - "\xd0\xad" => '"', "\xd1\x8d" => "'", #Э э - "\xd0\xae" => '>', "\xd1\x8e" => '.', #Ю ю - "\xd0\xaf" => 'Z', "\xd1\x8f" => 'z', #Я я - ), - ); - - #несуществующие N-граммы для гласных букв - private $vowels3_lc = array( - 'en' => array( - 'aea' => 0, - 'aei' => 1, - 'aeo' => 2, - 'aeu' => 3, - 'aia' => 4, - 'aie' => 5, - 'aii' => 6, - 'aoi' => 7, - 'aou' => 8, - 'aue' => 9, - 'aya' => 10, - 'aye' => 11, - 'ayi' => 12, - 'ayo' => 13, - 'ayu' => 14, - 'eae' => 15, - 'eau' => 16, - 'eea' => 17, - 'eei' => 18, - 'eeu' => 19, - 'eia' => 20, - 'eiu' => 21, - 'eoi' => 22, - 'eou' => 23, - 'eya' => 24, - 'eye' => 25, - 'eyi' => 26, - 'eyo' => 27, - 'iae' => 28, - 'iai' => 29, - 'iao' => 30, - 'iau' => 31, - 'iei' => 32, - 'ieu' => 33, - 'ioa' => 34, - 'ioe' => 35, - 'iou' => 36, - 'iya' => 37, - 'oae' => 38, - 'oea' => 39, - 'oei' => 40, - 'oeo' => 41, - 'oeu' => 42, - 'oey' => 43, - 'oia' => 44, - 'oie' => 45, - 'ooe' => 46, - 'ooi' => 47, - 'oou' => 48, - 'oua' => 49, - 'oue' => 50, - 'oui' => 51, - 'oya' => 52, - 'oye' => 53, - 'oyi' => 54, - 'oyo' => 55, - 'uae' => 56, - 'uai' => 57, - 'uay' => 58, - 'uea' => 59, - 'uee' => 60, - 'uei' => 61, - 'ueo' => 62, - 'ueu' => 63, - 'uey' => 64, - 'uia' => 65, - 'uie' => 66, - 'uio' => 67, - 'uiu' => 68, - 'uoa' => 69, - 'uoi' => 70, - 'uou' => 71, - 'uoy' => 72, - 'uya' => 73, - 'uye' => 74, - 'uyi' => 75, - 'yae' => 76, - 'yao' => 77, - 'yau' => 78, - 'yea' => 79, - 'yei' => 80, - 'yeo' => 81, - 'yey' => 82, - 'yie' => 83, - 'yoi' => 84, - 'you' => 85, - 'yoy' => 86, - 'yua' => 87, - ), - 'tt' => array( - 'аау' => 0, - 'аео' => 
1, - 'аеу' => 2, - 'аиа' => 3, - 'аио' => 4, - 'аиу' => 5, - 'аои' => 6, - 'ауэ' => 7, - 'аяя' => 8, - 'еаэ' => 9, - 'еее' => 10, - 'еео' => 11, - 'еоа' => 12, - 'еои' => 13, - 'еоо' => 14, - 'еую' => 15, - 'еуя' => 16, - 'еуё' => 17, - 'иау' => 18, - 'иео' => 19, - 'иие' => 20, - 'иоа' => 21, - 'иои' => 22, - 'иоу' => 23, - 'иоэ' => 24, - 'ияе' => 25, - 'ияи' => 26, - 'ияю' => 27, - 'оаэ' => 28, - 'оео' => 29, - 'оею' => 30, - 'оие' => 31, - 'оуе' => 32, - 'оуя' => 33, - 'оюе' => 34, - 'оюю' => 35, - 'ояе' => 36, - 'уео' => 37, - 'уюю' => 38, - ), - ); - - #несуществующие N-граммы для согласных букв - private $consonants4_lc = array( - 'en' => array( - 'bldg' => 0, - 'blvd' => 1, - 'bscr' => 2, - 'bstr' => 3, - 'cbcm' => 4, - 'cbft' => 5, - 'chfr' => 6, - 'chmn' => 7, - 'chsc' => 8, - 'chsh' => 9, - 'chst' => 10, - 'chth' => 11, - 'chts' => 12, - 'ckbr' => 13, - 'ckch' => 14, - 'ckcl' => 15, - 'ckdr' => 16, - 'ckgr' => 17, - 'cksc' => 18, - 'cksf' => 19, - 'cksh' => 20, - 'cksk' => 21, - 'cksl' => 22, - 'cksm' => 23, - 'cksn' => 24, - 'cksp' => 25, - 'ckst' => 26, - 'cksw' => 27, - 'ckth' => 28, - 'cktr' => 29, - 'ckwh' => 30, - 'cmps' => 31, - 'dspr' => 32, - 'dstr' => 33, - 'dthw' => 34, - 'ffsc' => 35, - 'ffsh' => 36, - 'ffsp' => 37, - 'fthl' => 38, - 'ftsm' => 39, - 'ftsp' => 40, - 'gdns' => 41, - 'ghbr' => 42, - 'ghfl' => 43, - 'ghsh' => 44, - 'ghtb' => 45, - 'ghtc' => 46, - 'ghtf' => 47, - 'ghth' => 48, - 'ghtj' => 49, - 'ghtl' => 50, - 'ghtm' => 51, - 'ghtn' => 52, - 'ghtr' => 53, - 'ghts' => 54, - 'ghtw' => 55, - 'hdbk' => 56, - 'hnst' => 57, - 'jctn' => 58, - 'khsh' => 59, - 'khst' => 60, - 'lchr' => 61, - 'ldpr' => 62, - 'ldsh' => 63, - 'ldsm' => 64, - 'ldsp' => 65, - 'ldst' => 66, - 'lfsk' => 67, - 'lfth' => 68, - 'lgth' => 69, - 'llfl' => 70, - 'llfr' => 71, - 'llph' => 72, - 'llpl' => 73, - 'llsh' => 74, - 'llsp' => 75, - 'llst' => 76, - 'lltr' => 77, - 'llwr' => 78, - 'lmcr' => 79, - 'lmsm' => 80, - 'lnrk' => 81, - 'lnsh' => 82, - 'lptr' => 83, - 
'lsgr' => 84, - 'lshm' => 85, - 'lshw' => 86, - 'lstr' => 87, - 'lthf' => 88, - 'ltsf' => 89, - 'ltsh' => 90, - 'ltst' => 91, - 'mbsc' => 92, - 'mbsh' => 93, - 'mbsk' => 94, - 'mbst' => 95, - 'mddx' => 96, - 'mdnt' => 97, - 'mpbl' => 98, - 'mpgr' => 99, - 'mphl' => 100, - 'mphr' => 101, - 'mpsh' => 102, - 'mpst' => 103, - 'mptl' => 104, - 'mptn' => 105, - 'mptr' => 106, - 'mpts' => 107, - 'mscr' => 108, - 'mstr' => 109, - 'nchb' => 110, - 'nchl' => 111, - 'nchm' => 112, - 'nchn' => 113, - 'nchp' => 114, - 'nchr' => 115, - 'nchw' => 116, - 'nctl' => 117, - 'nctn' => 118, - 'ndbk' => 119, - 'ndbr' => 120, - 'ndch' => 121, - 'ndfl' => 122, - 'ndgl' => 123, - 'ndgr' => 124, - 'ndsc' => 125, - 'ndsh' => 126, - 'ndsl' => 127, - 'ndsm' => 128, - 'ndsp' => 129, - 'ndst' => 130, - 'ndsw' => 131, - 'ndth' => 132, - 'ndwr' => 133, - 'ngcr' => 134, - 'ngsg' => 135, - 'ngsh' => 136, - 'ngsm' => 137, - 'ngsp' => 138, - 'ngst' => 139, - 'ngth' => 140, - 'ngtz' => 141, - 'nksg' => 142, - 'nksh' => 143, - 'nksm' => 144, - 'nkst' => 145, - 'nsch' => 146, - 'nscr' => 147, - 'nsgr' => 148, - 'nshr' => 149, - 'nskr' => 150, - 'nspl' => 151, - 'nspr' => 152, - 'nssh' => 153, - 'nstr' => 154, - 'ntbr' => 155, - 'nthl' => 156, - 'nthr' => 157, - 'nths' => 158, - 'ntsh' => 159, - 'ntsm' => 160, - 'phth' => 161, - 'pstr' => 162, - 'pthr' => 163, - 'pths' => 164, - 'ptwr' => 165, - 'rbst' => 166, - 'rchb' => 167, - 'rchd' => 168, - 'rchl' => 169, - 'rchm' => 170, - 'rchn' => 171, - 'rchp' => 172, - 'rchw' => 173, - 'rdsh' => 174, - 'rdsm' => 175, - 'rdst' => 176, - 'rghs' => 177, - 'rkpl' => 178, - 'rksc' => 179, - 'rksh' => 180, - 'rksk' => 181, - 'rksm' => 182, - 'rksp' => 183, - 'rkst' => 184, - 'rldl' => 185, - 'rldw' => 186, - 'rlfr' => 187, - 'rmch' => 188, - 'rmst' => 189, - 'rmth' => 190, - 'rnbl' => 191, - 'rndl' => 192, - 'rnsk' => 193, - 'rnsp' => 194, - 'rnst' => 195, - 'rsch' => 196, - 'rscr' => 197, - 'rshl' => 198, - 'rshn' => 199, - 'rspr' => 200, - 'rstl' => 201, - 'rstr' => 
202, - 'rsts' => 203, - 'rstw' => 204, - 'rtbr' => 205, - 'rtch' => 206, - 'rtcr' => 207, - 'rthb' => 208, - 'rthc' => 209, - 'rthd' => 210, - 'rthf' => 211, - 'rthl' => 212, - 'rthm' => 213, - 'rthq' => 214, - 'rthr' => 215, - 'rths' => 216, - 'rthw' => 217, - 'rtsh' => 218, - 'rtsm' => 219, - 'rtsp' => 220, - 'rtsw' => 221, - 'schl' => 222, - 'schm' => 223, - 'schn' => 224, - 'schw' => 225, - 'scrp' => 226, - 'sgmt' => 227, - 'shcl' => 228, - 'shkh' => 229, - 'shpr' => 230, - 'shpt' => 231, - 'shst' => 232, - 'shtr' => 233, - 'shwh' => 234, - 'smth' => 235, - 'ssrs' => 236, - 'ssst' => 237, - 'sstd' => 238, - 'sstr' => 239, - 'stcr' => 240, - 'sthm' => 241, - 'stpl' => 242, - 'stpr' => 243, - 'stsc' => 244, - 'stwr' => 245, - 'tblt' => 246, - 'tchb' => 247, - 'tchc' => 248, - 'tchd' => 249, - 'tchf' => 250, - 'tchl' => 251, - 'tchm' => 252, - 'tchp' => 253, - 'tchw' => 254, - 'thdr' => 255, - 'thsh' => 256, - 'thsk' => 257, - 'thsp' => 258, - 'thst' => 259, - 'tsch' => 260, - 'tspr' => 261, - 'tstr' => 262, - 'tthr' => 263, - 'ttsb' => 264, - 'tzkr' => 265, - 'whsl' => 266, - 'wnbr' => 267, - 'wnpl' => 268, - 'wnsf' => 269, - 'wnsh' => 270, - 'wnsm' => 271, - 'wnsp' => 272, - 'wnst' => 273, - 'wnsw' => 274, - 'wnth' => 275, - 'wntr' => 276, - 'wrnt' => 277, - 'wsfl' => 278, - 'wspr' => 279, - 'wstr' => 280, - 'xthl' => 281, - ), - 'tt' => array( - 'блзд' => 0, - 'бльд' => 1, - 'брьс' => 2, - 'бств' => 3, - 'бстр' => 4, - 'взбл' => 5, - 'взбр' => 6, - 'взгл' => 7, - 'взгр' => 8, - 'вздв' => 9, - 'вздр' => 10, - 'врвг' => 11, - 'врск' => 12, - 'вскл' => 13, - 'вскр' => 14, - 'вспл' => 15, - 'вспр' => 16, - 'вств' => 17, - 'встр' => 18, - 'всхл' => 19, - 'всхр' => 20, - 'втск' => 21, - 'вхск' => 22, - 'грск' => 23, - 'гств' => 24, - 'гтст' => 25, - 'гшпр' => 26, - 'двзд' => 27, - 'джск' => 28, - 'дрст' => 29, - 'дскр' => 30, - 'дств' => 31, - 'дстр' => 32, - 'дтск' => 33, - 'жств' => 34, - 'звзд' => 35, - 'знст' => 36, - 'зтьс' => 37, - 'йздр' => 38, - 'йкбр' => 39, 
- 'йльн' => 40, - 'йншт' => 41, - 'йпфр' => 42, - 'йств' => 43, - 'йстр' => 44, - 'йтск' => 45, - 'йфст' => 46, - 'йхсв' => 47, - 'йхск' => 48, - 'йхср' => 49, - 'йхст' => 50, - 'кскл' => 51, - 'кскр' => 52, - 'кспл' => 53, - 'кспр' => 54, - 'кств' => 55, - 'кстн' => 56, - 'кстр' => 57, - 'лвст' => 58, - 'лжск' => 59, - 'лльн' => 60, - 'лльс' => 61, - 'лстр' => 62, - 'лсть' => 63, - 'льгв' => 64, - 'льдж' => 65, - 'льдк' => 66, - 'льдм' => 67, - 'льдс' => 68, - 'льдф' => 69, - 'льдц' => 70, - 'льдш' => 71, - 'льдъ' => 72, - 'льдь' => 73, - 'льзк' => 74, - 'льзн' => 75, - 'льзь' => 76, - 'лькл' => 77, - 'лькн' => 78, - 'льпн' => 79, - 'льпт' => 80, - 'льск' => 81, - 'льсн' => 82, - 'льст' => 83, - 'льтк' => 84, - 'льтм' => 85, - 'льтн' => 86, - 'льтп' => 87, - 'льтр' => 88, - 'льтс' => 89, - 'льтт' => 90, - 'льтф' => 91, - 'льфр' => 92, - 'льцг' => 93, - 'льчс' => 94, - 'льшб' => 95, - 'льшк' => 96, - 'льшн' => 97, - 'льшп' => 98, - 'льшф' => 99, - 'льшь' => 100, - 'мбль' => 101, - 'мбрс' => 102, - 'мвзв' => 103, - 'мздр' => 104, - 'мств' => 105, - 'мтск' => 106, - 'нгль' => 107, - 'нгст' => 108, - 'ндгр' => 109, - 'ндск' => 110, - 'ндсп' => 111, - 'ндшп' => 112, - 'ндшт' => 113, - 'нкск' => 114, - 'нктн' => 115, - 'нктс' => 116, - 'нсгр' => 117, - 'нскм' => 118, - 'нскр' => 119, - 'нспл' => 120, - 'нств' => 121, - 'нстк' => 122, - 'нстр' => 123, - 'нтгл' => 124, - 'нтль' => 125, - 'нтрб' => 126, - 'нтрв' => 127, - 'нтрг' => 128, - 'нтрд' => 129, - 'нтрм' => 130, - 'нтрн' => 131, - 'нтрп' => 132, - 'нтрр' => 133, - 'нтрф' => 134, - 'нтск' => 135, - 'нтст' => 136, - 'нфск' => 137, - 'нцкл' => 138, - 'нцпл' => 139, - 'нькн' => 140, - 'ньск' => 141, - 'ньчж' => 142, - 'псск' => 143, - 'пств' => 144, - 'птск' => 145, - 'рбск' => 146, - 'ргпр' => 147, - 'ргск' => 148, - 'ргфл' => 149, - 'рдск' => 150, - 'рдсм' => 151, - 'рдст' => 152, - 'рздр' => 153, - 'рзть' => 154, - 'ркгр' => 155, - 'ркск' => 156, - 'рктн' => 157, - 'рльс' => 158, - 'рмск' => 159, - 'рмтр' => 160, - 
'рнск' => 161, - 'рпск' => 162, - 'рсдр' => 163, - 'рсск' => 164, - 'рств' => 165, - 'рстк' => 166, - 'рстн' => 167, - 'рстр' => 168, - 'рстс' => 169, - 'рсть' => 170, - 'ртвл' => 171, - 'ртвр' => 172, - 'ртгр' => 173, - 'рткр' => 174, - 'ртпл' => 175, - 'ртпр' => 176, - 'ртск' => 177, - 'ртсм' => 178, - 'ртшк' => 179, - 'ртьф' => 180, - 'рхзв' => 181, - 'рхпл' => 182, - 'рхпр' => 183, - 'рхсв' => 184, - 'рхск' => 185, - 'рхсм' => 186, - 'рхср' => 187, - 'рхтв' => 188, - 'рхшт' => 189, - 'рщвл' => 190, - 'рьмл' => 191, - 'скск' => 192, - 'спрь' => 193, - 'сспр' => 194, - 'ссср' => 195, - 'сств' => 196, - 'сстр' => 197, - 'ссшп' => 198, - 'ствл' => 199, - 'стрс' => 200, - 'стрш' => 201, - 'стск' => 202, - 'стьб' => 203, - 'стьд' => 204, - 'стьс' => 205, - 'ськн' => 206, - 'сьмн' => 207, - 'тмст' => 208, - 'тпрр' => 209, - 'трст' => 210, - 'тскр' => 211, - 'тств' => 212, - 'тстр' => 213, - 'ттль' => 214, - 'ттск' => 215, - 'тхск' => 216, - 'фств' => 217, - 'фстр' => 218, - 'хств' => 219, - 'хстр' => 220, - 'хткл' => 221, - 'хтск' => 222, - 'хтсм' => 223, - 'цстр' => 224, - ), - ); - - #несуществующие биграммы в начале и конце слов - private $bigrams = array( - #ru - ' ёё' => 0, - ' ёа' => 0, - ' ёб' => 0, - ' ёв' => 0, - ' ёг' => 0, - ' ёд' => 0, - ' ёе' => 0, - ' ёз' => 0, - ' ёи' => 0, - ' ёй' => 0, - ' ён' => 0, - ' ёо' => 0, - ' ёп' => 0, - ' ёс' => 0, - ' ёт' => 0, - ' ёу' => 0, - ' ёф' => 0, - ' ёц' => 0, - ' ёч' => 0, - ' ёщ' => 0, - ' ёъ' => 0, - ' ёы' => 0, - ' ёь' => 0, - ' ёэ' => 0, - ' ёю' => 0, - ' ёя' => 0, - ' аё' => 0, - ' аа' => 0, - ' ае' => 0, - ' ач' => 0, - ' аъ' => 0, - ' аы' => 0, - ' аь' => 0, - ' аю' => 0, - ' ая' => 0, - ' бб' => 0, - ' бв' => 0, - ' бг' => 0, - ' бж' => 0, - ' бй' => 0, - ' бк' => 0, - ' бм' => 0, - ' бн' => 0, - ' бп' => 0, - ' бс' => 0, - ' бт' => 0, - ' бф' => 0, - ' бх' => 0, - ' бц' => 0, - ' бч' => 0, - ' бш' => 0, - ' бщ' => 0, - ' бъ' => 0, - ' вй' => 0, - ' вф' => 0, - ' вщ' => 0, - ' вэ' => 0, - ' вю' => 0, - ' 
гё' => 0, - ' гб' => 0, - ' гз' => 0, - ' гй' => 0, - ' гк' => 0, - ' гп' => 0, - ' гс' => 0, - ' гт' => 0, - ' гф' => 0, - ' гх' => 0, - ' гц' => 0, - ' гч' => 0, - ' гш' => 0, - ' гщ' => 0, - ' гъ' => 0, - ' гь' => 0, - ' гэ' => 0, - ' дб' => 0, - ' дг' => 0, - ' дд' => 0, - ' дй' => 0, - ' дк' => 0, - ' дп' => 0, - ' дс' => 0, - ' дт' => 0, - ' дф' => 0, - ' дх' => 0, - ' дц' => 0, - ' дч' => 0, - ' дш' => 0, - ' дъ' => 0, - ' дэ' => 0, - ' еа' => 0, - ' еб' => 0, - ' еи' => 0, - ' ео' => 0, - ' ет' => 0, - ' еу' => 0, - ' ец' => 0, - #' еш' => 0, - ' еъ' => 0, - ' еы' => 0, - ' еь' => 0, - ' еэ' => 0, - ' ея' => 0, - ' жз' => 0, - ' жй' => 0, - ' жк' => 0, - ' жл' => 0, - ' жп' => 0, - ' жс' => 0, - ' жт' => 0, - ' жф' => 0, - ' жх' => 0, - ' жц' => 0, - ' жч' => 0, - ' жш' => 0, - ' жщ' => 0, - ' жъ' => 0, - ' жы' => 0, - ' жь' => 0, - ' жэ' => 0, - #' жю' => 0, - ' жя' => 0, - ' зб' => 0, - ' зж' => 0, - ' зз' => 0, - ' зй' => 0, - ' зк' => 0, - ' зп' => 0, - ' зс' => 0, - ' зт' => 0, - ' зф' => 0, - ' зх' => 0, - ' зц' => 0, - ' зч' => 0, - ' зш' => 0, - ' зщ' => 0, - ' зъ' => 0, - ' зь' => 0, - ' зэ' => 0, - ' иё' => 0, - ' иа' => 0, - ' иф' => 0, - ' иц' => 0, - ' иъ' => 0, - ' иы' => 0, - ' иь' => 0, - ' иэ' => 0, - ' ия' => 0, - ' йё' => 0, - ' йа' => 0, - ' йб' => 0, - ' йв' => 0, - ' йг' => 0, - ' йд' => 0, - ' йж' => 0, - ' йз' => 0, - ' йи' => 0, - ' йй' => 0, - ' йк' => 0, - ' йл' => 0, - ' йм' => 0, - ' йн' => 0, - ' йп' => 0, - ' йр' => 0, - ' йс' => 0, - ' йт' => 0, - ' йу' => 0, - ' йф' => 0, - ' йх' => 0, - ' йц' => 0, - ' йч' => 0, - ' йш' => 0, - ' йщ' => 0, - ' йъ' => 0, - ' йы' => 0, - ' йь' => 0, - ' йэ' => 0, - ' йю' => 0, - ' йя' => 0, - ' кё' => 0, - ' кб' => 0, - ' кд' => 0, - ' кж' => 0, - ' кй' => 0, - ' кк' => 0, - ' кф' => 0, - ' кц' => 0, - ' кч' => 0, - ' кщ' => 0, - ' къ' => 0, - ' кя' => 0, - ' лв' => 0, - ' лд' => 0, - ' лз' => 0, - ' лй' => 0, - ' лк' => 0, - ' лл' => 0, - ' лм' => 0, - ' лн' => 0, - ' лп' => 0, - ' лр' => 0, 
- ' лс' => 0, - ' лт' => 0, - ' лф' => 0, - ' лх' => 0, - ' лц' => 0, - ' лч' => 0, - ' лш' => 0, - ' лщ' => 0, - ' лъ' => 0, - ' лэ' => 0, - ' мб' => 0, - ' мв' => 0, - ' мд' => 0, - ' мж' => 0, - ' мй' => 0, - ' мк' => 0, - ' мп' => 0, - ' мт' => 0, - ' мф' => 0, - ' мц' => 0, - ' мъ' => 0, - ' мь' => 0, - ' нб' => 0, - ' нв' => 0, - ' нг' => 0, - ' нд' => 0, - ' нж' => 0, - ' нз' => 0, - ' нй' => 0, - ' нк' => 0, - ' нл' => 0, - ' нм' => 0, - ' нн' => 0, - ' нп' => 0, - ' нс' => 0, - ' нт' => 0, - ' нф' => 0, - ' нх' => 0, - ' нц' => 0, - ' нч' => 0, - ' нш' => 0, - ' нщ' => 0, - ' нъ' => 0, - ' оё' => 0, - ' ои' => 0, - ' оу' => 0, - ' оъ' => 0, - ' оы' => 0, - ' оь' => 0, - ' оэ' => 0, - ' оя' => 0, - ' пб' => 0, - ' пв' => 0, - ' пг' => 0, - ' пд' => 0, - ' пж' => 0, - ' пз' => 0, - ' пй' => 0, - ' пк' => 0, - ' пм' => 0, - ' пп' => 0, - ' пц' => 0, - ' пщ' => 0, - ' пъ' => 0, - ' рб' => 0, - ' рг' => 0, - ' рз' => 0, - ' рй' => 0, - ' рк' => 0, - ' рл' => 0, - ' рм' => 0, - ' рн' => 0, - ' рп' => 0, - ' рр' => 0, - ' рф' => 0, - ' рх' => 0, - ' рч' => 0, - ' рш' => 0, - ' рщ' => 0, - ' ръ' => 0, - ' сй' => 0, - ' сщ' => 0, - ' тб' => 0, - ' тг' => 0, - ' тд' => 0, - ' тж' => 0, - ' тз' => 0, - ' тй' => 0, - ' тн' => 0, - ' тт' => 0, - ' тх' => 0, - ' тц' => 0, - ' тч' => 0, - ' тш' => 0, - ' тъ' => 0, - ' уу' => 0, - ' уъ' => 0, - ' уы' => 0, - ' уь' => 0, - ' фб' => 0, - ' фв' => 0, - ' фг' => 0, - ' фд' => 0, - ' фж' => 0, - ' фз' => 0, - ' фй' => 0, - ' фк' => 0, - ' фм' => 0, - ' фн' => 0, - ' фп' => 0, - ' фс' => 0, - ' фх' => 0, - ' фц' => 0, - ' фч' => 0, - ' фш' => 0, - ' фщ' => 0, - ' фъ' => 0, - ' фэ' => 0, - ' фя' => 0, - ' хё' => 0, - ' хб' => 0, - ' хг' => 0, - ' хд' => 0, - ' хж' => 0, - ' хз' => 0, - ' хй' => 0, - ' хк' => 0, - ' хп' => 0, - ' хс' => 0, - ' хт' => 0, - ' хф' => 0, - ' хц' => 0, - ' хч' => 0, - ' хш' => 0, - ' хщ' => 0, - ' хъ' => 0, - ' хы' => 0, - ' хь' => 0, - #' хэ' => 0, - ' хю' => 0, - ' хя' => 0, - ' цё' => 0, - ' цб' => 
0, - ' цг' => 0, - ' цд' => 0, - ' цж' => 0, - ' цй' => 0, - ' цл' => 0, - ' цм' => 0, - ' цн' => 0, - ' цп' => 0, - ' цр' => 0, - ' цс' => 0, - ' цт' => 0, - ' цф' => 0, - ' цх' => 0, - ' цц' => 0, - ' цч' => 0, - ' цш' => 0, - ' цщ' => 0, - ' цъ' => 0, - ' ць' => 0, - ' цэ' => 0, - ' цю' => 0, - ' ця' => 0, - ' чб' => 0, - ' чг' => 0, - ' чд' => 0, - ' чж' => 0, - ' чз' => 0, - ' чй' => 0, - ' чн' => 0, - ' чп' => 0, - ' чс' => 0, - ' чф' => 0, - ' чц' => 0, - ' чч' => 0, - ' чщ' => 0, - ' чъ' => 0, - ' чы' => 0, - ' чэ' => 0, - ' чю' => 0, - ' чя' => 0, - ' шб' => 0, - ' шг' => 0, - ' шд' => 0, - ' шж' => 0, - ' шз' => 0, - ' шй' => 0, - ' шс' => 0, - ' шф' => 0, - ' шц' => 0, - ' шч' => 0, - ' шщ' => 0, - ' шъ' => 0, - ' шы' => 0, - ' шэ' => 0, - ' шю' => 0, - ' шя' => 0, - ' щб' => 0, - ' щв' => 0, - ' щг' => 0, - ' щд' => 0, - ' щж' => 0, - ' щз' => 0, - ' щй' => 0, - ' щк' => 0, - ' щл' => 0, - ' щм' => 0, - ' щн' => 0, - ' що' => 0, - ' щп' => 0, - ' щр' => 0, - ' щс' => 0, - ' щт' => 0, - ' щф' => 0, - ' щх' => 0, - ' щц' => 0, - ' щч' => 0, - ' щш' => 0, - ' щщ' => 0, - ' щъ' => 0, - ' щы' => 0, - ' щь' => 0, - ' щэ' => 0, - ' щю' => 0, - ' щя' => 0, - ' ъё' => 0, - ' ъа' => 0, - ' ъб' => 0, - ' ъв' => 0, - ' ъг' => 0, - ' ъд' => 0, - ' ъе' => 0, - ' ъж' => 0, - ' ъз' => 0, - ' ъи' => 0, - ' ъй' => 0, - ' ък' => 0, - ' ъл' => 0, - ' ъм' => 0, - ' ън' => 0, - ' ъо' => 0, - ' ъп' => 0, - ' ър' => 0, - ' ъс' => 0, - ' ът' => 0, - ' ъу' => 0, - ' ъф' => 0, - ' ъх' => 0, - ' ъц' => 0, - ' ъч' => 0, - ' ъш' => 0, - ' ъщ' => 0, - ' ъъ' => 0, - ' ъы' => 0, - ' ъь' => 0, - ' ъэ' => 0, - ' ъю' => 0, - ' ъя' => 0, - ' ыё' => 0, - ' ыа' => 0, - ' ыб' => 0, - ' ыв' => 0, - ' ыг' => 0, - ' ыд' => 0, - ' ые' => 0, - ' ыж' => 0, - ' ыз' => 0, - ' ыи' => 0, - ' ый' => 0, - ' ык' => 0, - ' ыл' => 0, - ' ын' => 0, - ' ыо' => 0, - ' ып' => 0, - ' ыр' => 0, - ' ыс' => 0, - ' ыт' => 0, - ' ыу' => 0, - ' ыф' => 0, - ' ых' => 0, - ' ыц' => 0, - ' ыч' => 0, - ' ыш' => 0, - ' ыщ' 
=> 0, - ' ыъ' => 0, - ' ыы' => 0, - ' ыь' => 0, - ' ыэ' => 0, - ' ыю' => 0, - ' ыя' => 0, - ' ьё' => 0, - ' ьа' => 0, - ' ьб' => 0, - ' ьв' => 0, - ' ьг' => 0, - ' ьд' => 0, - ' ье' => 0, - ' ьж' => 0, - ' ьз' => 0, - ' ьи' => 0, - ' ьй' => 0, - ' ьк' => 0, - ' ьл' => 0, - ' ьм' => 0, - ' ьн' => 0, - ' ьо' => 0, - ' ьп' => 0, - ' ьр' => 0, - ' ьс' => 0, - ' ьт' => 0, - ' ьу' => 0, - ' ьф' => 0, - ' ьх' => 0, - ' ьц' => 0, - ' ьч' => 0, - ' ьш' => 0, - ' ьщ' => 0, - ' ьъ' => 0, - ' ьы' => 0, - ' ьь' => 0, - ' ьэ' => 0, - ' ью' => 0, - ' ья' => 0, - ' эё' => 0, - ' эа' => 0, - ' эе' => 0, - ' эи' => 0, - ' эц' => 0, - ' эч' => 0, - ' эщ' => 0, - ' эъ' => 0, - ' эы' => 0, - ' эь' => 0, - ' ээ' => 0, - ' эю' => 0, - ' юё' => 0, - ' юе' => 0, - ' юи' => 0, - ' юй' => 0, - ' юо' => 0, - ' юу' => 0, - ' юц' => 0, - ' юш' => 0, - ' ющ' => 0, - ' юъ' => 0, - ' юы' => 0, - ' юь' => 0, - ' юэ' => 0, - ' юя' => 0, - ' яё' => 0, - ' яа' => 0, - ' яе' => 0, - ' яж' => 0, - ' яо' => 0, - ' яу' => 0, - ' яф' => 0, - ' яц' => 0, - ' яъ' => 0, - ' яы' => 0, - ' яь' => 0, - ' яэ' => 0, - ' яю' => 0, - ' яя' => 0, - 'ёё' => 0, - 'ёё ' => 0, - 'ёа' => 0, - 'ёа ' => 0, - 'ёг ' => 0, - 'ёе' => 0, - 'ёе ' => 0, - 'ёи' => 0, - 'ёи ' => 0, - 'ёй' => 0, - 'ёо' => 0, - 'ёо ' => 0, - 'ёу' => 0, - 'ёу ' => 0, - 'ёф' => 0, - 'ёф ' => 0, - 'ёц ' => 0, - 'ёч ' => 0, - 'ёщ ' => 0, - 'ёъ' => 0, - 'ёъ ' => 0, - 'ёы' => 0, - 'ёы ' => 0, - 'ёь' => 0, - 'ёь ' => 0, - 'ёэ' => 0, - 'ёэ ' => 0, - 'ёю' => 0, - 'ёя' => 0, - 'ёя ' => 0, - 'аё ' => 0, - 'аа ' => 0, - 'аъ' => 0, - 'аъ ' => 0, - 'аы' => 0, - 'аы ' => 0, - 'аь' => 0, - 'аь ' => 0, - 'аэ ' => 0, - 'бё ' => 0, - 'бб ' => 0, - 'бв ' => 0, - 'бг ' => 0, - 'бд ' => 0, - 'бж ' => 0, - 'бз ' => 0, - 'бй' => 0, - 'бй ' => 0, - 'бк ' => 0, - 'бм ' => 0, - 'бн ' => 0, - 'бп ' => 0, - 'бт ' => 0, - 'бф ' => 0, - 'бх ' => 0, - 'бц ' => 0, - 'бч ' => 0, - 'бш ' => 0, - 'бщ ' => 0, - 'бъ ' => 0, - 'бэ ' => 0, - 'вё ' => 0, - 'вб ' => 0, - 'вв ' => 0, - 'вд ' 
=> 0, - 'вж' => 0, - 'вж ' => 0, - 'вз ' => 0, - 'вй' => 0, - 'вй ' => 0, - 'вл ' => 0, - 'вп ' => 0, - 'вф ' => 0, - 'вц ' => 0, - 'вч ' => 0, - 'вщ ' => 0, - 'въ' => 0, - 'въ ' => 0, - 'вэ ' => 0, - 'гё' => 0, - 'гё ' => 0, - 'гб ' => 0, - 'гг ' => 0, - 'гж' => 0, - 'гж ' => 0, - 'гз ' => 0, - 'гй' => 0, - 'гй ' => 0, - 'гк ' => 0, - 'гн ' => 0, - 'гп ' => 0, - 'гф ' => 0, - 'гх' => 0, - 'гх ' => 0, - 'гц' => 0, - 'гц ' => 0, - 'гч ' => 0, - 'гш ' => 0, - 'гщ ' => 0, - 'гъ' => 0, - 'гъ ' => 0, - 'гы ' => 0, - 'гь' => 0, - 'гь ' => 0, - 'гэ ' => 0, - 'гю' => 0, - 'гю ' => 0, - 'гя' => 0, - 'гя ' => 0, - 'дё ' => 0, - 'дб ' => 0, - 'дг ' => 0, - 'дд ' => 0, - 'дй' => 0, - 'дй ' => 0, - 'дк ' => 0, - 'дм ' => 0, - 'дн ' => 0, - 'дп ' => 0, - 'дс ' => 0, - 'дф ' => 0, - 'дх ' => 0, - 'дц ' => 0, - 'дч ' => 0, - 'дш ' => 0, - 'дщ ' => 0, - 'дъ ' => 0, - 'еа ' => 0, - 'еу ' => 0, - 'еъ' => 0, - 'еъ ' => 0, - 'еы' => 0, - 'еы ' => 0, - 'еь' => 0, - 'еь ' => 0, - 'еэ ' => 0, - 'жё ' => 0, - 'жв ' => 0, - 'жг ' => 0, - 'жж ' => 0, - 'жз ' => 0, - 'жй' => 0, - 'жй ' => 0, - 'жк ' => 0, - 'жл ' => 0, - 'жн ' => 0, - 'жп ' => 0, - 'жр ' => 0, - 'жс ' => 0, - 'жт ' => 0, - 'жф ' => 0, - 'жх ' => 0, - 'жц ' => 0, - 'жч ' => 0, - 'жш' => 0, - 'жш ' => 0, - 'жщ' => 0, - 'жщ ' => 0, - 'жъ' => 0, - 'жъ ' => 0, - 'жы ' => 0, - 'жэ ' => 0, - 'жю' => 0, - 'жю ' => 0, - 'жя' => 0, - 'жя ' => 0, - 'зё ' => 0, - 'зж ' => 0, - 'зз ' => 0, - 'зй' => 0, - 'зй ' => 0, - 'зк ' => 0, - 'зп ' => 0, - 'зр ' => 0, - 'зс ' => 0, - 'зт ' => 0, - 'зф' => 0, - 'зф ' => 0, - 'зх' => 0, - 'зх ' => 0, - 'зц ' => 0, - 'зч ' => 0, - 'зш ' => 0, - 'зщ' => 0, - 'зщ ' => 0, - 'зъ ' => 0, - 'зэ ' => 0, - 'иъ' => 0, - 'иъ ' => 0, - 'иы' => 0, - 'иы ' => 0, - 'иь' => 0, - 'иь ' => 0, - 'иэ ' => 0, - 'йё' => 0, - 'йё ' => 0, - 'йа ' => 0, - 'йв ' => 0, - 'йг ' => 0, - 'йж' => 0, - 'йж ' => 0, - 'йз ' => 0, - 'йи ' => 0, - 'йй' => 0, - 'йй ' => 0, - 'йо ' => 0, - 'йу' => 0, - 'йу ' => 0, - 'йч ' => 0, - 'йш ' => 
0, - 'йщ ' => 0, - 'йъ' => 0, - 'йъ ' => 0, - 'йы' => 0, - 'йы ' => 0, - 'йь' => 0, - 'йь ' => 0, - 'йэ' => 0, - 'йэ ' => 0, - 'йю' => 0, - 'йю ' => 0, - 'йя' => 0, - 'кё ' => 0, - 'кб ' => 0, - 'кг ' => 0, - 'кд ' => 0, - 'кж ' => 0, - 'кз ' => 0, - 'кй' => 0, - 'кй ' => 0, - 'км ' => 0, - 'кн ' => 0, - 'кф ' => 0, - 'кц ' => 0, - 'кч ' => 0, - 'кш ' => 0, - 'кщ' => 0, - 'кщ ' => 0, - 'къ' => 0, - 'къ ' => 0, - 'кы ' => 0, - 'кь ' => 0, - 'кэ' => 0, - 'кэ ' => 0, - 'кя' => 0, - 'кя ' => 0, - 'лв ' => 0, - 'лж ' => 0, - 'лз ' => 0, - 'лй' => 0, - 'лй ' => 0, - 'лр ' => 0, - 'лф ' => 0, - 'лх ' => 0, - 'лц ' => 0, - 'лч ' => 0, - 'лш ' => 0, - 'лщ ' => 0, - 'лъ' => 0, - 'лъ ' => 0, - 'лэ' => 0, - 'лэ ' => 0, - 'мё ' => 0, - 'мв ' => 0, - 'мг ' => 0, - 'мд ' => 0, - 'мз ' => 0, - 'мй' => 0, - 'мк ' => 0, - 'мл ' => 0, - 'мр ' => 0, - 'мх ' => 0, - 'мц ' => 0, - 'мч ' => 0, - 'мш ' => 0, - 'мщ ' => 0, - 'мъ' => 0, - 'мъ ' => 0, - 'мэ ' => 0, - 'мю ' => 0, - 'нё ' => 0, - 'нб ' => 0, - 'нв ' => 0, - 'нй' => 0, - 'нл ' => 0, - 'нп ' => 0, - 'нщ ' => 0, - 'нъ ' => 0, - 'нэ ' => 0, - 'оъ' => 0, - 'оъ ' => 0, - 'оы' => 0, - 'оы ' => 0, - 'оь' => 0, - 'оь ' => 0, - 'пё ' => 0, - 'пб ' => 0, - 'пв' => 0, - 'пв ' => 0, - 'пг' => 0, - 'пг ' => 0, - 'пд ' => 0, - 'пж' => 0, - 'пж ' => 0, - 'пз' => 0, - 'пз ' => 0, - 'пй' => 0, - 'пй ' => 0, - 'пк ' => 0, - 'пл ' => 0, - 'пм ' => 0, - 'пн ' => 0, - 'пф ' => 0, - 'пх ' => 0, - 'пц ' => 0, - 'пч ' => 0, - 'пш ' => 0, - 'пщ ' => 0, - 'пъ' => 0, - 'пъ ' => 0, - 'пэ' => 0, - 'пэ ' => 0, - 'пю ' => 0, - 'рё ' => 0, - 'рй' => 0, - 'рй ' => 0, - 'ръ' => 0, - 'ръ ' => 0, - 'рэ ' => 0, - 'сб ' => 0, - 'св ' => 0, - 'сг ' => 0, - 'сд ' => 0, - 'сж ' => 0, - 'сз' => 0, - 'сз ' => 0, - 'сй' => 0, - 'сй ' => 0, - 'сн ' => 0, - 'сп ' => 0, - 'сф ' => 0, - 'сц ' => 0, - 'сч ' => 0, - 'сш ' => 0, - 'сщ ' => 0, - 'съ ' => 0, - 'сэ ' => 0, - 'тб ' => 0, - 'тг ' => 0, - 'тд ' => 0, - 'тж ' => 0, - 'тз ' => 0, - 'тй' => 0, - 'тй ' => 0, - 'тк ' => 
0, - 'тл ' => 0, - 'тп ' => 0, - 'тф ' => 0, - 'тх ' => 0, - 'тц ' => 0, - 'тш ' => 0, - 'тщ ' => 0, - 'тъ ' => 0, - 'уё ' => 0, - 'уо ' => 0, - 'уу ' => 0, - 'уц ' => 0, - 'уъ' => 0, - 'уъ ' => 0, - 'уы' => 0, - 'уы ' => 0, - 'уь' => 0, - 'уь ' => 0, - 'уэ ' => 0, - 'фё ' => 0, - 'фб ' => 0, - 'фв ' => 0, - 'фг ' => 0, - 'фд ' => 0, - 'фж' => 0, - 'фж ' => 0, - 'фз' => 0, - 'фз ' => 0, - 'фй' => 0, - 'фй ' => 0, - 'фк ' => 0, - 'фл ' => 0, - 'фн ' => 0, - 'фп' => 0, - 'фп ' => 0, - 'фс ' => 0, - 'фх' => 0, - 'фх ' => 0, - 'фц' => 0, - 'фц ' => 0, - 'фч ' => 0, - 'фш ' => 0, - 'фщ ' => 0, - 'фъ' => 0, - 'фъ ' => 0, - 'фэ' => 0, - 'фэ ' => 0, - 'фю ' => 0, - 'хё' => 0, - 'хё ' => 0, - 'хб ' => 0, - 'хг ' => 0, - 'хд ' => 0, - 'хж ' => 0, - 'хз ' => 0, - 'хй' => 0, - 'хй ' => 0, - 'хк ' => 0, - 'хн ' => 0, - 'хп ' => 0, - 'хр ' => 0, - 'хс ' => 0, - 'хф ' => 0, - 'хх ' => 0, - 'хц ' => 0, - 'хч ' => 0, - 'хш ' => 0, - 'хщ' => 0, - 'хщ ' => 0, - 'хъ ' => 0, - 'хы' => 0, - 'хы ' => 0, - 'хь' => 0, - 'хь ' => 0, - 'хэ ' => 0, - 'хю' => 0, - 'хю ' => 0, - 'хя' => 0, - 'хя ' => 0, - 'цё' => 0, - 'цё ' => 0, - 'цб' => 0, - 'цб ' => 0, - 'цв ' => 0, - 'цг ' => 0, - 'цд ' => 0, - 'цж' => 0, - 'цж ' => 0, - 'цз ' => 0, - 'цй' => 0, - 'цй ' => 0, - 'цк ' => 0, - 'цл ' => 0, - 'цм ' => 0, - 'цн ' => 0, - 'цп ' => 0, - 'цр ' => 0, - 'цс ' => 0, - 'цт ' => 0, - 'цф' => 0, - 'цф ' => 0, - 'цх' => 0, - 'цх ' => 0, - 'цц ' => 0, - 'цч' => 0, - 'цч ' => 0, - 'цш ' => 0, - 'цщ' => 0, - 'цщ ' => 0, - 'цъ' => 0, - 'цъ ' => 0, - 'ць' => 0, - 'ць ' => 0, - 'цэ' => 0, - 'цэ ' => 0, - 'цю' => 0, - 'цю ' => 0, - 'ця' => 0, - 'ця ' => 0, - 'чё ' => 0, - 'чб ' => 0, - 'чг' => 0, - 'чг ' => 0, - 'чд' => 0, - 'чд ' => 0, - 'чж ' => 0, - 'чз' => 0, - 'чз ' => 0, - 'чй' => 0, - 'чй ' => 0, - 'чк ' => 0, - 'чл ' => 0, - 'чм ' => 0, - 'чн ' => 0, - 'чп' => 0, - 'чп ' => 0, - 'чр ' => 0, - 'чс ' => 0, - 'чф' => 0, - 'чф ' => 0, - 'чх ' => 0, - 'чц ' => 0, - 'чч ' => 0, - 'чш ' => 0, - 'чщ' => 0, - 
'чщ ' => 0, - 'чъ' => 0, - 'чъ ' => 0, - 'чы' => 0, - 'чы ' => 0, - 'чэ' => 0, - 'чэ ' => 0, - 'чю' => 0, - 'чю ' => 0, - 'чя' => 0, - 'чя ' => 0, - 'шё ' => 0, - 'шб ' => 0, - 'шг ' => 0, - 'шд' => 0, - 'шд ' => 0, - 'шж' => 0, - 'шж ' => 0, - 'шз' => 0, - 'шз ' => 0, - 'шй' => 0, - 'шй ' => 0, - 'шк ' => 0, - 'шл ' => 0, - 'шм ' => 0, - 'шн ' => 0, - 'шп ' => 0, - 'шр ' => 0, - 'шс ' => 0, - 'шф ' => 0, - 'шх' => 0, - 'шх ' => 0, - 'шч ' => 0, - 'шш' => 0, - 'шш ' => 0, - 'шщ' => 0, - 'шщ ' => 0, - 'шъ' => 0, - 'шъ ' => 0, - 'шы' => 0, - 'шы ' => 0, - 'шэ' => 0, - 'шэ ' => 0, - 'шя' => 0, - 'шя ' => 0, - 'щб' => 0, - 'щб ' => 0, - 'щв ' => 0, - 'щг' => 0, - 'щг ' => 0, - 'щд' => 0, - 'щд ' => 0, - 'щж' => 0, - 'щж ' => 0, - 'щз' => 0, - 'щз ' => 0, - 'щй' => 0, - 'щй ' => 0, - 'щк' => 0, - 'щк ' => 0, - 'щл' => 0, - 'щл ' => 0, - 'щм ' => 0, - 'щн ' => 0, - 'щп' => 0, - 'щп ' => 0, - 'щр ' => 0, - 'щс' => 0, - 'щс ' => 0, - 'щт' => 0, - 'щт ' => 0, - 'щф' => 0, - 'щф ' => 0, - 'щх' => 0, - 'щх ' => 0, - 'щц' => 0, - 'щц ' => 0, - 'щч' => 0, - 'щч ' => 0, - 'щш' => 0, - 'щш ' => 0, - 'щщ' => 0, - 'щщ ' => 0, - 'щъ' => 0, - 'щъ ' => 0, - 'щы' => 0, - 'щы ' => 0, - 'щэ' => 0, - 'щэ ' => 0, - 'щю' => 0, - 'щю ' => 0, - 'щя' => 0, - 'щя ' => 0, - 'ъё ' => 0, - 'ъа' => 0, - 'ъа ' => 0, - 'ъб' => 0, - 'ъб ' => 0, - 'ъв' => 0, - 'ъв ' => 0, - 'ъг' => 0, - 'ъг ' => 0, - 'ъд' => 0, - 'ъд ' => 0, - 'ъе ' => 0, - 'ъж' => 0, - 'ъж ' => 0, - 'ъз' => 0, - 'ъз ' => 0, - 'ъи' => 0, - 'ъи ' => 0, - 'ъй' => 0, - 'ъй ' => 0, - 'ък' => 0, - 'ък ' => 0, - 'ъл' => 0, - 'ъл ' => 0, - 'ъм' => 0, - 'ъм ' => 0, - 'ън' => 0, - 'ън ' => 0, - 'ъо' => 0, - 'ъо ' => 0, - 'ъп' => 0, - 'ъп ' => 0, - 'ър' => 0, - 'ър ' => 0, - 'ъс' => 0, - 'ъс ' => 0, - 'ът' => 0, - 'ът ' => 0, - 'ъу' => 0, - 'ъу ' => 0, - 'ъф' => 0, - 'ъф ' => 0, - 'ъх' => 0, - 'ъх ' => 0, - 'ъц' => 0, - 'ъц ' => 0, - 'ъч' => 0, - 'ъч ' => 0, - 'ъш' => 0, - 'ъш ' => 0, - 'ъщ' => 0, - 'ъщ ' => 0, - 'ъъ' => 0, - 'ъъ ' => 0, - 'ъы' 
=> 0, - 'ъы ' => 0, - 'ъь' => 0, - 'ъь ' => 0, - 'ъэ' => 0, - 'ъэ ' => 0, - 'ъю ' => 0, - 'ъя ' => 0, - 'ыё' => 0, - 'ыё ' => 0, - 'ыа' => 0, - 'ыа ' => 0, - 'ыи ' => 0, - 'ыо ' => 0, - 'ыу ' => 0, - 'ыф ' => 0, - 'ыъ' => 0, - 'ыъ ' => 0, - 'ыы' => 0, - 'ыы ' => 0, - 'ыь' => 0, - 'ыь ' => 0, - 'ыэ' => 0, - 'ыэ ' => 0, - 'ыю ' => 0, - 'ьа' => 0, - 'ьа ' => 0, - 'ьв ' => 0, - 'ьг ' => 0, - 'ьж ' => 0, - 'ьз ' => 0, - 'ьй' => 0, - 'ьй ' => 0, - 'ьл ' => 0, - 'ьн ' => 0, - 'ьр ' => 0, - 'ьу' => 0, - 'ьу ' => 0, - 'ьх ' => 0, - 'ьщ ' => 0, - 'ьъ' => 0, - 'ьъ ' => 0, - 'ьы ' => 0, - 'ьь' => 0, - 'ьь ' => 0, - 'ьэ ' => 0, - 'эё' => 0, - 'эё ' => 0, - 'эа' => 0, - 'эа ' => 0, - 'эб' => 0, - 'эб ' => 0, - 'эв ' => 0, - 'эг ' => 0, - 'эд ' => 0, - 'эе' => 0, - 'эе ' => 0, - 'эж' => 0, - 'эж ' => 0, - 'эз ' => 0, - 'эи ' => 0, - 'эй ' => 0, - 'эл ' => 0, - 'эм ' => 0, - 'эн ' => 0, - 'эо' => 0, - 'эо ' => 0, - 'эу' => 0, - 'эу ' => 0, - 'эф ' => 0, - 'эх ' => 0, - 'эц' => 0, - 'эц ' => 0, - 'эч' => 0, - 'эч ' => 0, - 'эш ' => 0, - 'эщ' => 0, - 'эщ ' => 0, - 'эъ' => 0, - 'эъ ' => 0, - 'эы' => 0, - 'эы ' => 0, - 'эь' => 0, - 'эь ' => 0, - 'ээ ' => 0, - 'эю' => 0, - 'эю ' => 0, - 'эя' => 0, - 'эя ' => 0, - 'юё' => 0, - 'юё ' => 0, - 'юа ' => 0, - 'юе ' => 0, - 'юж ' => 0, - 'юи ' => 0, - 'юл ' => 0, - 'юо ' => 0, - 'юу' => 0, - 'юу ' => 0, - 'юц ' => 0, - 'юъ' => 0, - 'юъ ' => 0, - 'юы' => 0, - 'юы ' => 0, - 'юь' => 0, - 'юь ' => 0, - 'юэ ' => 0, - 'юя' => 0, - 'яё' => 0, - 'яё ' => 0, - 'яа' => 0, - 'яа ' => 0, - 'яе ' => 0, - 'яо ' => 0, - 'яф' => 0, - 'яф ' => 0, - 'яъ' => 0, - 'яъ ' => 0, - 'яы' => 0, - 'яы ' => 0, - 'яь' => 0, - 'яь ' => 0, - 'яэ' => 0, - 'яэ ' => 0, - #en - ' \'f' => 0, - ' \'p' => 0, - ' \'q' => 0, - ' \'r' => 0, - ' \'x' => 0, - ' \'y' => 0, - ' \'z' => 0, - ' bj' => 0, - ' bq' => 0, - ' bz' => 0, - ' c\'' => 0, - ' cq' => 0, - ' cv' => 0, - ' cx' => 0, - ' dq' => 0, - ' dx' => 0, - ' ez' => 0, - ' f\'' => 0, - ' fh' => 0, - ' fk' => 0, - ' fq' => 0, - ' 
fv' => 0, - ' fw' => 0, - ' fz' => 0, - ' g\'' => 0, - ' gf' => 0, - ' gg' => 0, - ' gj' => 0, - ' gv' => 0, - ' gx' => 0, - ' gz' => 0, - ' h\'' => 0, - ' hj' => 0, - ' hk' => 0, - ' hn' => 0, - ' hq' => 0, - ' hx' => 0, - ' iq' => 0, - ' iw' => 0, - ' iy' => 0, - ' jb' => 0, - ' jf' => 0, - ' jh' => 0, - ' jj' => 0, - ' jk' => 0, - ' jl' => 0, - ' jm' => 0, - ' jq' => 0, - ' jw' => 0, - ' jx' => 0, - ' jy' => 0, - ' jz' => 0, - ' k\'' => 0, - ' kf' => 0, - ' kj' => 0, - ' kq' => 0, - ' kt' => 0, - ' kx' => 0, - ' kz' => 0, - ' lj' => 0, - ' lk' => 0, - ' lq' => 0, - ' lv' => 0, - ' mj' => 0, - ' mq' => 0, - ' mz' => 0, - ' nj' => 0, - ' nk' => 0, - ' nq' => 0, - ' nz' => 0, - ' oq' => 0, - ' pj' => 0, - ' pz' => 0, - ' qb' => 0, - ' qe' => 0, - ' qf' => 0, - ' qg' => 0, - ' qh' => 0, - ' qj' => 0, - ' qk' => 0, - ' qo' => 0, - ' qp' => 0, - ' qs' => 0, - ' qv' => 0, - ' qx' => 0, - ' qy' => 0, - ' qz' => 0, - ' rb' => 0, - ' rk' => 0, - ' rq' => 0, - ' rv' => 0, - ' rx' => 0, - ' rz' => 0, - ' sz' => 0, - ' tf' => 0, - ' tg' => 0, - ' tj' => 0, - ' tq' => 0, - ' u\'' => 0, - ' ue' => 0, - ' uj' => 0, - ' uo' => 0, - ' uq' => 0, - ' uu' => 0, - ' uy' => 0, - ' vb' => 0, - ' vj' => 0, - ' vk' => 0, - ' vn' => 0, - ' vq' => 0, - ' vr' => 0, - ' vv' => 0, - ' vw' => 0, - ' vx' => 0, - ' vy' => 0, - ' vz' => 0, - ' wj' => 0, - ' wl' => 0, - ' wn' => 0, - ' wq' => 0, - ' wx' => 0, - ' wz' => 0, - ' xb' => 0, - ' xf' => 0, - ' xg' => 0, - ' xh' => 0, - ' xj' => 0, - ' xk' => 0, - ' xq' => 0, - ' xt' => 0, - ' xu' => 0, - ' xz' => 0, - ' yf' => 0, - ' yg' => 0, - ' yh' => 0, - ' yj' => 0, - ' yk' => 0, - ' yl' => 0, - ' yn' => 0, - ' yq' => 0, - ' yv' => 0, - ' yx' => 0, - ' yy' => 0, - ' yz' => 0, - ' z\'' => 0, - ' zb' => 0, - ' zc' => 0, - ' zd' => 0, - ' zf' => 0, - ' zg' => 0, - ' zh' => 0, - ' zj' => 0, - ' zk' => 0, - ' zl' => 0, - ' zm' => 0, - ' zq' => 0, - ' zr' => 0, - ' zv' => 0, - ' zw' => 0, - ' zx' => 0, - ' zz' => 0, - '\'a ' => 0, - '\'b' => 0, - '\'b ' 
=> 0, - '\'c ' => 0, - '\'f' => 0, - '\'f ' => 0, - '\'g' => 0, - '\'g ' => 0, - '\'h ' => 0, - '\'i ' => 0, - '\'j' => 0, - '\'j ' => 0, - '\'k' => 0, - '\'k ' => 0, - '\'l ' => 0, - '\'n ' => 0, - '\'o ' => 0, - '\'p ' => 0, - '\'q' => 0, - '\'q ' => 0, - '\'r ' => 0, - '\'u' => 0, - '\'u ' => 0, - '\'v ' => 0, - '\'w ' => 0, - '\'x' => 0, - '\'x ' => 0, - '\'z' => 0, - '\'z ' => 0, - 'b\' ' => 0, - 'bg ' => 0, - 'bh ' => 0, - 'bp ' => 0, - 'bq' => 0, - 'bq ' => 0, - 'bv ' => 0, - 'bx' => 0, - 'bz' => 0, - 'bz ' => 0, - 'c\' ' => 0, - 'cf ' => 0, - 'cj' => 0, - 'cn ' => 0, - 'cq ' => 0, - 'cv' => 0, - 'cw' => 0, - 'cx' => 0, - 'cx ' => 0, - 'cz ' => 0, - 'db ' => 0, - 'dj ' => 0, - 'dk ' => 0, - 'dw ' => 0, - 'dx' => 0, - 'eh ' => 0, - 'ej ' => 0, - 'f\' ' => 0, - 'fg ' => 0, - 'fh ' => 0, - 'fj' => 0, - 'fj ' => 0, - 'fk' => 0, - 'fk ' => 0, - 'fq' => 0, - 'fq ' => 0, - 'fv ' => 0, - 'fw ' => 0, - 'fx' => 0, - 'fx ' => 0, - 'fz' => 0, - 'fz ' => 0, - 'g\' ' => 0, - 'gc ' => 0, - 'gf ' => 0, - 'gj ' => 0, - 'gk ' => 0, - 'gl ' => 0, - 'gq' => 0, - 'gq ' => 0, - 'gv' => 0, - 'gv ' => 0, - 'gw ' => 0, - 'gx' => 0, - 'gx ' => 0, - 'gz ' => 0, - 'hb ' => 0, - 'hc ' => 0, - 'hg ' => 0, - 'hh ' => 0, - 'hj' => 0, - 'hj ' => 0, - 'hk ' => 0, - 'hv' => 0, - 'hv ' => 0, - 'hw ' => 0, - 'hx' => 0, - 'hx ' => 0, - 'hz' => 0, - 'i\' ' => 0, - 'ih ' => 0, - 'iq ' => 0, - 'iw ' => 0, - 'j\'' => 0, - 'j\' ' => 0, - 'jb' => 0, - 'jb ' => 0, - 'jc' => 0, - 'jc ' => 0, - 'jd' => 0, - 'jf' => 0, - 'jg' => 0, - 'jg ' => 0, - 'jh' => 0, - 'jh ' => 0, - 'jj' => 0, - 'jj ' => 0, - 'jk ' => 0, - 'jl ' => 0, - 'jm' => 0, - 'jm ' => 0, - 'jn' => 0, - 'jn ' => 0, - 'jp ' => 0, - 'jq' => 0, - 'jq ' => 0, - 'jr' => 0, - 'jr ' => 0, - 'js' => 0, - 'js ' => 0, - 'jt' => 0, - 'ju ' => 0, - 'jv' => 0, - 'jv ' => 0, - 'jw' => 0, - 'jw ' => 0, - 'jx' => 0, - 'jx ' => 0, - 'jy' => 0, - 'jy ' => 0, - 'jz' => 0, - 'jz ' => 0, - 'kb ' => 0, - 'kc ' => 0, - 'kd ' => 0, - 'kj ' => 0, - 'km ' => 0, - 'kp 
' => 0, - 'kq' => 0, - 'kq ' => 0, - 'kv' => 0, - 'kv ' => 0, - 'kx' => 0, - 'kx ' => 0, - 'kz' => 0, - 'kz ' => 0, - 'lg ' => 0, - 'lh ' => 0, - 'lj ' => 0, - 'lq ' => 0, - 'lr ' => 0, - 'lv ' => 0, - 'lw ' => 0, - 'lx' => 0, - 'lz ' => 0, - 'm\' ' => 0, - 'mg ' => 0, - 'mh ' => 0, - 'mj ' => 0, - 'mk ' => 0, - 'mq' => 0, - 'mq ' => 0, - 'mx' => 0, - 'mx ' => 0, - 'mz' => 0, - 'nb ' => 0, - 'nm ' => 0, - 'pj ' => 0, - 'pk ' => 0, - 'pq' => 0, - 'pq ' => 0, - 'pv' => 0, - 'pw ' => 0, - 'px' => 0, - 'px ' => 0, - 'pz ' => 0, - 'q\'' => 0, - 'q\' ' => 0, - 'qa ' => 0, - 'qb' => 0, - 'qb ' => 0, - 'qc' => 0, - 'qc ' => 0, - 'qd' => 0, - 'qd ' => 0, - 'qe' => 0, - 'qe ' => 0, - 'qf' => 0, - 'qf ' => 0, - 'qg' => 0, - 'qg ' => 0, - 'qh' => 0, - 'qh ' => 0, - 'qi' => 0, - 'qj' => 0, - 'qj ' => 0, - 'qk' => 0, - 'qk ' => 0, - 'ql' => 0, - 'ql ' => 0, - 'qm' => 0, - 'qm ' => 0, - 'qn' => 0, - 'qn ' => 0, - 'qo' => 0, - 'qo ' => 0, - 'qp' => 0, - 'qp ' => 0, - 'qq' => 0, - 'qq ' => 0, - 'qr' => 0, - 'qs' => 0, - 'qs ' => 0, - 'qt' => 0, - 'qt ' => 0, - 'qu ' => 0, - 'qv' => 0, - 'qv ' => 0, - 'qw' => 0, - 'qw ' => 0, - 'qx' => 0, - 'qx ' => 0, - 'qy' => 0, - 'qy ' => 0, - 'qz' => 0, - 'qz ' => 0, - 'rq ' => 0, - 'rz ' => 0, - 'sg ' => 0, - 'sj ' => 0, - 'sx' => 0, - 'sx ' => 0, - 'sz' => 0, - 'sz ' => 0, - 'tg ' => 0, - 'tj ' => 0, - 'tq' => 0, - 'tq ' => 0, - 'tx' => 0, - 'tx ' => 0, - 'uj ' => 0, - 'uq ' => 0, - 'uu ' => 0, - 'uw ' => 0, - 'v\' ' => 0, - 'vb' => 0, - 'vb ' => 0, - 'vc' => 0, - 'vf' => 0, - 'vf ' => 0, - 'vg' => 0, - 'vh' => 0, - 'vh ' => 0, - 'vj' => 0, - 'vj ' => 0, - 'vk' => 0, - 'vk ' => 0, - 'vl ' => 0, - 'vm' => 0, - 'vn ' => 0, - 'vp' => 0, - 'vp ' => 0, - 'vq' => 0, - 'vq ' => 0, - 'vr ' => 0, - 'vv ' => 0, - 'vw' => 0, - 'vw ' => 0, - 'vx' => 0, - 'vz' => 0, - 'vz ' => 0, - 'w\' ' => 0, - 'wb ' => 0, - 'wc ' => 0, - 'wf ' => 0, - 'wg ' => 0, - 'wj' => 0, - 'wj ' => 0, - 'wq' => 0, - 'wq ' => 0, - 'wr ' => 0, - 'wv' => 0, - 'wv ' => 0, - 'wx' => 0, 
- 'wz ' => 0, - 'x\'' => 0, - 'x\' ' => 0, - 'xa ' => 0, - 'xb ' => 0, - 'xc ' => 0, - 'xd' => 0, - 'xd ' => 0, - 'xf ' => 0, - 'xg ' => 0, - 'xh ' => 0, - 'xj' => 0, - 'xj ' => 0, - 'xk' => 0, - 'xk ' => 0, - 'xl ' => 0, - 'xm ' => 0, - 'xn' => 0, - 'xn ' => 0, - 'xp ' => 0, - 'xq ' => 0, - 'xr' => 0, - 'xr ' => 0, - 'xs ' => 0, - 'xu ' => 0, - 'xv' => 0, - 'xv ' => 0, - 'xw ' => 0, - 'xx' => 0, - 'xz' => 0, - 'xz ' => 0, - 'yb ' => 0, - 'yc ' => 0, - 'yd ' => 0, - 'yf ' => 0, - 'yg ' => 0, - 'yh ' => 0, - 'yj ' => 0, - 'yq' => 0, - 'yq ' => 0, - 'yu ' => 0, - 'yv ' => 0, - 'yw ' => 0, - 'yy' => 0, - 'yy ' => 0, - 'yz ' => 0, - 'z\'' => 0, - 'z\' ' => 0, - 'zb ' => 0, - 'zc' => 0, - 'zc ' => 0, - 'zd' => 0, - 'zd ' => 0, - 'zf' => 0, - 'zf ' => 0, - 'zg ' => 0, - 'zh' => 0, - 'zh ' => 0, - 'zj' => 0, - 'zj ' => 0, - 'zk ' => 0, - 'zl ' => 0, - 'zn' => 0, - 'zn ' => 0, - 'zp ' => 0, - 'zq' => 0, - 'zq ' => 0, - 'zr' => 0, - 'zr ' => 0, - 'zs' => 0, - 'zs ' => 0, - 'zt' => 0, - 'zt ' => 0, - 'zu ' => 0, - 'zv ' => 0, - 'zw ' => 0, - 'zx' => 0, - 'zx ' => 0, - ); - - /** - * - * @param array|null $words_exceptions - */ - public function __construct(array $words_exceptions = null) - { - if (!ReflectionTypeHint::isValid()) { - return false; - } - #русский --> английский: - $this->en_correct = '/(?: (?:' . $this->tt_f . ') - (?: (?:' . $this->en_uniq . ') | (?:' . $this->en_sc . '){2} ) - | (?:' . $this->en_sc . ') - (?:' . $this->tt_f . ') - (?:' . $this->en_sc . ') - | (?: (?:' . $this->en_uniq . ') | (?:' . $this->en_sc . '){2} ) - (?:' . $this->tt_f . ') - ) - /sxSX'; - #английский --> русский: - $this->tt_correct = '/(?: (?:' . $this->en_sc . ') - (?: (?:' . $this->tt_uniq . ') | (?:' . $this->tt_f . '){2} ) - | (?:' . $this->tt_f . ') - (?:' . $this->en_sc . ') - (?:' . $this->tt_f . ') - | (?: (?:' . $this->tt_uniq . ') | (?:' . $this->tt_f . '){2} ) - (?:' . $this->en_sc . 
') - ) - /sxSX'; - $this->table_flip = array( - 0 => array_flip($this->table[0]), - 1 => array_flip($this->table[1]), - ); - if (is_array($words_exceptions)) { - $this->words_exceptions += $words_exceptions; - } - } - - /** - * Исправляет клавиатурные опечатки в тексте. - * - * @param scalar|null $s Текст в кодировке UTF-8. - * @param int $mode Константы self::SIMILAR_CHARS и/или self::KEYBOARD_LAYOUT, - * (их можно комбинировать). Описание констант см. выше. - * При использовании self::KEYBOARD_LAYOUT время работы увеличивается примерно в 10 раз. - * @param array &$words Ассоц. массив со словами, которые были исправлены: - * в ключах оригиналы, в значениях исправленные слова. - * @return string|bool Returns FALSE if error occured - */ - public function parse($s, $mode = self::SIMILAR_CHARS, array &$words = null) - { - if (!ReflectionTypeHint::isValid()) { - return false; - } - if (!is_string($s)) { - return $s; - } - - if ($mode < self::SIMILAR_CHARS || $mode > (self::SIMILAR_CHARS | self::KEYBOARD_LAYOUT | self::ADD_FIX)) { - trigger_error('Unknown mode', E_USER_WARNING); - return false; - } - - $this->mode = $mode; - - #вырезаем и заменяем некоторые символы - $additional_chars = array( - "\xc2\xad", #"мягкие" переносы строк (­) - ); - #http://ru.wikipedia.org/wiki/Диакритические_знаки - $s = UTF8::diactrical_remove($s, $additional_chars, $is_can_restored = true, $restore_table); - - $this->words = array(); - $s = $this->_parse1($s); - $s = $this->_parse2($s); - $s = UTF8::diactrical_restore($s, $restore_table); - $words = $this->words; - return $s; - } - - private function _parse1($s) - { - #заменяем слова из текста, минимальная длина -- 3 символа, меньше нельзя - return preg_replace_callback('/(?> (' . $this->en . ') #1 латинские буквы - | (' . $this->tt . ') #2 русские буквы - | (' . $this->sc . ') #3 символы, которые м.б. 
набраны по ошибке в английской раскладке клавиатуры вместо русских букв - ){3,}+ - /sxSX', array($this, '_word'), $s); - } - - private function _parse2($s) - { - #исправляем русские буквы (похожие на латинские) с рядом стоящими цифрами на латинские - #например, это м. б. каталожные номера автозапчастей, в которых есть русские буквы: 1500A023, 52511-60900-H0, K2305, XA527672 - #корректно обрабатываем вхождения '1-ое', 'Ту-134', 'А19-3107/06-43-Ф02-4227/06-С1' - if (version_compare(PHP_VERSION, '5.2.0', '<')) { - return $s; - } - return preg_replace_callback('~(?: (?<=[^-_/]|^) - (?:' . $this->ru_similar . ')++ - (?= (?:' . $this->en . '|[-_/])*+ (?<=[^-_/]|' . $this->en . '[-_/]) - \d [\d-_/]*+ (?!' . $this->tt_uniq . ') - ) - | (?<=[^-_/]|^) - \d (?:' . $this->en . '|[-_/])*+ (?<=[^-_/]|' . $this->en . '[-_/]) - \K - (?:' . $this->ru_similar . ')++ - (?= [\d-_/]*+ (?!' . $this->tt_uniq . ') ) - ) - ~sxSX', array($this, '_entry'), $s); - } - - private function _entry(array &$a) - { - $entry =& $a[0]; - $s = strtr($entry, $this->table[0]); - if ($s !== $entry) { - $this->words[$entry] = $s; - } - return $s; - } - - private function _word(array &$a) - { - $word = $a[0]; - #var_export($a); - - $suggestions = array(); - - #если найдено слово из мешанины русских и латинских букв - if (!empty($a[1]) && !empty($a[2])) { - if (($this->mode & self::SIMILAR_CHARS) === 0) { - return $word; - } - #ВНИМАНИЕ! порядок следования правил преобразования имеет значение! - - /* - Исправляем ошибочно набранные буквы, которые выглядят одинаково - в инициалах перед фамилиями (русский <--> английский), например: Т.С.Навка - */ - - #0a. английский --> русский: - if ($word[1] === '.' #оптимизация - && preg_match('/^ ( ' . $this->en_similar_uc . '\. #первый инициал - (?:' . $this->en_similar_uc . '\.)? #второй инициал (необязательно) - ) #1 инициалы - (' . $this->no_sc . 
'{2,}+) #2 фамилия (английские и русские буквы) - $/sxSX', $word, $m) - ) { - $m[2] = $this->_parse1($m[2]); - #фамилия по-русски? - if (preg_match('/^ (?:' . $this->tt_uc . ') #первая буква д.б. большая - (?:' . $this->tt_f . ')+ #минимальное кол-во букв в фамилии = 2 - $/sxSX', $m[2])) { - return strtr($m[1], $this->table_flip[0]) . $m[2]; - } - } - - #0b. русский --> английский: - if ($word[2] === '.' #оптимизация - && preg_match('/^ ( ' . $this->ru_similar_uc . '\. #первый инициал - (?:' . $this->ru_similar_uc . '\.)? #второй инициал (необязательно) - ) #1 инициалы - (' . $this->no_sc . '{2,}+) #2 фамилия (английские и русские буквы) - $/sxSX', $word, $m) - ) { - $m[2] = $this->_parse1($m[2]); - #фамилия по-англ.? - if (preg_match('/^ ' . $this->en_uc . ' #первая буква д.б. большая - ' . $this->en . '++ #минимальное кол-во букв в фамилии = 2 - $/sxSX', $m[2])) { - return strtr($m[1], $this->table[0]) . $m[2]; - } - } - - #1. английский --> русский: - $this->method = 0; #буквы, которые выглядят одинаково - $this->is_flip = true; - $s = $this->_replace($word, $this->tt_correct); - if ($word !== $s && !$this->_is_mixed($s)) { - $suggestions['tt0'] = $s; - } - - #2. английский --> русский: - $this->method = 1; #буквы в другой раскладке клавиатуры - $this->is_flip = true; - $s = $this->_replace($word, $this->tt_correct); - if ($word !== $s) { - $suggestions['tt1'] = $s; - } - - #3. русский --> английский: - $this->method = 0; #буквы, которые выглядят одинаково - $this->is_flip = false; - $s = $this->_replace($word, $this->en_correct); - if ($word !== $s && !$this->_is_mixed($s)) { - $suggestions['en0'] = $s; - } - - #4. русский --> английский: - $this->method = 1; #буквы в другой раскладке клавиатуры - $this->is_flip = false; - $s = $this->_replace($word, $this->en_correct); - if ($word !== $s) { - $suggestions['en1'] = $s; - } - } #если найдено слово только из латинских букв; минимальная длина -- 4 буквы! 
- elseif (!empty($a[1]) && strlen($word) >= 4) { - if (($this->mode & self::KEYBOARD_LAYOUT) === 0) { - return $word; - } - - #не обрабатываем аббревиатуры, пример: AMPAS - if (preg_match('/^(?:' . $this->en_uc . '|' . $this->sc . '){1,6}+$/sxSX', $word)) { - return $word; - } - - #английский --> русский: - $suggestions['en1'] = $word; - $suggestions['tt1'] = strtr($word, $this->table_flip[1]); - } #если найдено слово только из русских букв; минимальная длина -- 4 буквы! - elseif (!empty($a[2]) && strlen($word) >= 8) { - if (($this->mode & self::KEYBOARD_LAYOUT) === 0) { - return $word; - } - - #не обрабатываем аббревиатуры, пример: ДОСААФ - if (preg_match('/^(?:' . $this->tt_uc . '|' . $this->sc . '){1,6}+$/sxSX', $word)) { - return $word; - } - - #русский --> английский: - $suggestions['tt1'] = $word; - $suggestions['en1'] = strtr($word, $this->table[1]); - } #найдены спецсимволы или длина слова слишком мала - else { - return $word; - } - - $suggestions = array_unique($suggestions); - #var_export($suggestions); - - $c = count($suggestions); - if ($c === 0) { - $s = $word; - } else { - $s = $this->_detect($word, $suggestions, !empty($a[3])); - } - if ($s !== $word) { - $this->words[$word] = $s; - if ($this->mode >= (self::KEYBOARD_LAYOUT | self::ADD_FIX)) { - $s = '(' . $word . '=>' . $s . ')'; - } - } - return $s; - } - - private function _replace($word, $regexp) - { - do { - $word = preg_replace_callback($regexp, array(&$this, '_strtr'), $w = $word); - } while ($w !== $word); - return $word; - } - - private function _strtr(array $a) - { - $word =& $a[0]; - return strtr($word, $this->is_flip ? $this->table_flip[$this->method] : $this->table[$this->method]); - } - - private function _is_mixed($word) - { - return preg_match('/(?:' . $this->en . ')/sxSX', $word) && - preg_match('/(?:' . $this->tt_f . 
')/sxSX', $word); - } - - #выбираем из нескольких вариантов один - private function _detect($word, array $suggestions, $is_sc) - { - if (0) { - #DEBUG - - //$suggestions['?'] = $word; - var_export($suggestions); - } - #не д. б. несуществующих N-грамм - foreach ($suggestions as $type => $w) { - $lang = substr($type, 0, 2); - if ($this->_bigram_exists($w, $lang)) { - unset($suggestions[$type]); - } - } - if (0) { - #DEBUG - - //$suggestions['?'] = $word; - var_export($suggestions); - } - if (count($suggestions) === 0) { - return $word; - } - - $s = end($suggestions); - - #если в $word были спецсимволы, а в $s их уже нет, возвращаем $s - if ($is_sc && !preg_match('/' . $this->sc . '/sSX', $s)) { - return $s; - } - - #если в $s спецсимволов больше чем букв, возвращаем $word - $sc_count = 0; - $s = preg_replace('/' . $this->sc . '/sSX', '', $s, -1, $sc_count); - if ($sc_count > 0 && $sc_count > UTF8::strlen($s)) { - return $word; - } - - return reset($suggestions); - } - - #анализ на основе N-грамм русского и английского языка - private function _bigram_exists($word, $lang) - { - $word = ($lang === 'en') ? strtolower($word) : UTF8::lowercase($word); - - #шаг 0. - #проверяем слова в списке слов-исключений - if (array_key_exists($word, $this->words_exceptions[$lang])) { - return false; - } - - #шаг 1. - #проверка на 4 согласные буквы подряд; пример: больши{нств}о, юрисконсу{льтс}тво - if (preg_match('/(?:' . $this->consonant_lc[$lang] . '){4}/sxSX', $word, $m) - #проверяем список исключений - && !array_key_exists($m[0], $this->consonants4_lc[$lang]) - ) { - return true; - } - - #шаг 2. - #проверка на 3 гласные буквы подряд; пример: длиннош{еее}, зм{еео}бразный - if (preg_match('/(?:' . $this->vowel_lc[$lang] . '){3}/sxSX', $word, $m) - #проверяем список исключений - && !array_key_exists($m[0], $this->vowels3_lc[$lang]) - ) { - return true; - } - - #шаг 3. 
- $length = UTF8::strlen($word); - for ($pos = 0, $limit = $length - 1; $pos < $limit; $pos++) { - /* - TODO Качество проверки по несуществующим биграммам можно немного повысить, - если учитывать не только начало и конец слова, но и все позиции биграмм в слове. - */ - $ss = UTF8::substr($word, $pos, 2); - if ($pos === 0) { - $ss = ' ' . $ss; - } #beginning of word - elseif ($pos === $limit - 1) { - $ss .= ' '; - } #ending of word - if (array_key_exists($ss, $this->bigrams)) { - return true; - } - } - - return false; - } -} diff --git a/library/includes/classes/reflection.php b/library/includes/classes/reflection.php deleted file mode 100644 index 364c87ec6..000000000 --- a/library/includes/classes/reflection.php +++ /dev/null @@ -1,225 +0,0 @@ - 'is_int', - 'integer' => 'is_int', - 'digit' => 'ctype_digit', - 'number' => 'ctype_digit', - 'float' => 'is_float', - 'double' => 'is_float', - 'real' => 'is_float', - 'numeric' => 'is_numeric', - 'str' => 'is_string', - 'string' => 'is_string', - 'char' => 'is_string', - 'bool' => 'is_bool', - 'boolean' => 'is_bool', - 'null' => 'is_null', - 'array' => 'is_array', - 'obj' => 'is_object', - 'object' => 'is_object', - 'res' => 'is_resource', - 'resource' => 'is_resource', - 'scalar' => 'is_scalar', #integer, float, string or boolean - 'cb' => 'is_callable', - 'callback' => 'is_callable', - ); - - #calling the methods of this class only statically! - private function __construct() - { - } - - public static function isValid() - { - if (!assert_options(ASSERT_ACTIVE)) { - return true; - } - $bt = self::debugBacktrace(null, 1); - extract($bt); //to $file, $line, $function, $class, $object, $type, $args - if (!$args) { - return true; - } #speed improve - $r = new ReflectionMethod($class, $function); - $doc = $r->getDocComment(); - $cache_id = $class . $type . 
$function; - preg_match_all('~ [\r\n]++ [\x20\t]++ \* [\x20\t]++ - @param - [\x20\t]++ - \K #memory reduce - ( [_a-z]++[_a-z\d]*+ - (?>[|/,][_a-z]+[_a-z\d]*)*+ - ) #1 types - [\x20\t]++ - &?+\$([_a-z]++[_a-z\d]*+) #2 name - ~sixSX', $doc, $params, PREG_SET_ORDER); - $parameters = $r->getParameters(); - //d($args, $params, $parameters); - if (count($parameters) > count($params)) { - $message = 'phpDoc %d piece(s) @param description expected in %s%s%s(), %s given, ' . PHP_EOL - . 'called in %s on line %d ' . PHP_EOL - . 'and defined in %s on line %d'; - $message = sprintf($message, count($parameters), $class, $type, $function, count($params), $file, $line, $r->getFileName(), $r->getStartLine()); - trigger_error($message, E_USER_NOTICE); - } - foreach ($args as $i => $value) { - if (!isset($params[$i])) { - return true; - } - if ($parameters[$i]->name !== $params[$i][2]) { - $param_num = $i + 1; - $message = 'phpDoc @param %d in %s%s%s() must be named as $%s, $%s given, ' . PHP_EOL - . 'called in %s on line %d ' . PHP_EOL - . 'and defined in %s on line %d'; - $message = sprintf($message, $param_num, $class, $type, $function, $parameters[$i]->name, $params[$i][2], $file, $line, $r->getFileName(), $r->getStartLine()); - trigger_error($message, E_USER_NOTICE); - } - - $hints = preg_split('~[|/,]~sSX', $params[$i][1]); - if (!self::checkValueTypes($hints, $value)) { - $param_num = $i + 1; - $message = 'Argument %d passed to %s%s%s() must be an %s, %s given, ' . PHP_EOL - . 'called in %s on line %d ' . PHP_EOL - . 'and defined in %s on line %d'; - $message = sprintf($message, $param_num, $class, $type, $function, implode('|', $hints), (is_object($value) ? get_class($value) . ' ' : '') . gettype($value), $file, $line, $r->getFileName(), $r->getStartLine()); - trigger_error($message, E_USER_WARNING); - return false; - } - } - return true; - } - - /** - * Return stacktrace. Correctly work with call_user_func*() - * (totally skip them correcting caller references). 
- * If $return_frame is present, return only $return_frame matched caller, not all stacktrace. - * - * @param string|null $re_ignore example: '~^' . preg_quote(__CLASS__, '~') . '(?![a-zA-Z\d])~sSX' - * @param int|null $return_frame - * @return array - */ - public static function debugBacktrace($re_ignore = null, $return_frame = null) - { - $trace = debug_backtrace(); - - $a = array(); - $frames = 0; - for ($i = 0, $n = count($trace); $i < $n; $i++) { - $t = $trace[$i]; - if (!$t) { - continue; - } - - // Next frame. - $next = isset($trace[$i + 1]) ? $trace[$i + 1] : null; - - // Dummy frame before call_user_func*() frames. - if (!isset($t['file']) && $next) { - $t['over_function'] = $trace[$i + 1]['function']; - $t += $trace[$i + 1]; - $trace[$i + 1] = null; // skip call_user_func on next iteration - } - - // Skip myself frame. - if (++$frames < 2) { - continue; - } - - // 'class' and 'function' field of next frame define where this frame function situated. - // Skip frames for functions situated in ignored places. - if ($re_ignore && $next) { - // Name of function "inside which" frame was generated. - $frame_caller = (isset($next['class']) ? $next['class'] . $next['type'] : '') - . (isset($next['function']) ? $next['function'] : ''); - if (preg_match($re_ignore, $frame_caller)) { - continue; - } - } - - // On each iteration we consider ability to add PREVIOUS frame to $a stack. 
- if (count($a) === $return_frame) { - return $t; - } - $a[] = $t; - } - return $a; - } - - /** - * Checks a value to the allowed types - * - * @param array $types - * @param mixed $value - * @return bool - */ - public static function checkValueTypes(array $types, $value) - { - foreach ($types as $type) { - $type = strtolower($type); - if (array_key_exists($type, self::$hints) && call_user_func(self::$hints[$type], $value)) { - return true; - } - if (is_object($value) && @is_a($value, $type)) { - return true; - } - if ($type === 'mixed') { - return true; - } - } - return false; - } -} diff --git a/library/includes/classes/utf8.php b/library/includes/classes/utf8.php deleted file mode 100644 index ac7ab98e6..000000000 --- a/library/includes/classes/utf8.php +++ /dev/null @@ -1,4513 +0,0 @@ - = 5.3.x - * - * In Russian: - * - * Поддержка UTF-8 в PHP 5. - * - * Возможности и преимущества использования этого класса - * * Совместимость с интерфейсом стандартных PHP функций, работающих с однобайтовыми кодировками - * * Возможность работы без PHP расширений ICONV и MBSTRING, если они есть, то активно используются! 
- * * Полезные функции, отсутствующие в ICONV и MBSTRING - * * Методы, которые принимают и возвращают строку, умеют принимать и возвращать null (удобно при выборках значений из базы данных) - * * Несколько методов умеют обрабатывать массивы рекурсивно - * * Единый интерфейс и инкапсуляция (можно унаследоваться и переопределить методы) - * * Высокая производительность, надёжность и качественный код - * * PHP >= 5.3.x - * - * Example: - * $s = 'Hello, Привет'; - * if (UTF8::is_utf8($s)) echo UTF8::strlen($s); - * - * UTF-8 encoding scheme: - * 2^7 0x00000000 — 0x0000007F 0xxxxxxx - * 2^11 0x00000080 — 0x000007FF 110xxxxx 10xxxxxx - * 2^16 0x00000800 — 0x0000FFFF 1110xxxx 10xxxxxx 10xxxxxx - * 2^21 0x00010000 — 0x001FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - * 1-4 bytes length: 2^7 + 2^11 + 2^16 + 2^21 = 2 164 864 - * - * If I was a owner of the world, I would leave only 2 encoding: UTF-8 and UTF-32 ;-) - * - * Useful links - * http://ru.wikipedia.org/wiki/UTF8 - * http://www.madore.org/~david/misc/unitest/ A Unicode Test Page - * http://www.unicode.org/ - * http://www.unicode.org/reports/ - * http://www.unicode.org/reports/tr10/ Unicode Collation Algorithm - * http://www.unicode.org/Public/UCA/6.0.0/ Unicode Collation Algorithm - * http://www.unicode.org/reports/tr6/ A Standard Compression Scheme for Unicode - * http://www.fileformat.info/info/unicode/char/search.htm Unicode Character Search - * - * @link http://code.google.com/p/php5-utf8/ - * @license http://creativecommons.org/licenses/by-sa/3.0/ - * @author Nasibullin Rinat - * @version 2.2.2 - */ -class utf8 -{ - #REPLACEMENT CHARACTER (for broken char) - const REPLACEMENT_CHAR = "\xEF\xBF\xBD"; #U+FFFD - - /** - * Regular expression for a character in UTF-8 without the use of a flag /u - * @deprecated Instead, use a dot (".") and the flag /u, it works faster! 
- * @var string - */ - public static $char_re = ' [\x09\x0A\x0D\x20-\x7E] # ASCII strict - # [\x00-\x7F] # ASCII non-strict (including control chars) - | [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte - | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs - | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte - | \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates - | \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3 - | [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15 - | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16 - '; - - /** - * Combining diactrical marks (Unicode 5.1). - * - * For example, russian letters in composed form: "Ё" (U+0401), "Й" (U+0419), - * decomposed form: (U+0415 U+0308), (U+0418 U+0306) - * - * @link http://www.unicode.org/charts/PDF/U0300.pdf - * @link http://www.unicode.org/charts/PDF/U1DC0.pdf - * @link http://www.unicode.org/charts/PDF/UFE20.pdf - * @var string - */ - #public static $diactrical_re = '\p{M}'; #alternative, but only with /u flag - public static $diactrical_re = ' \xcc[\x80-\xb9]|\xcd[\x80-\xaf] #UNICODE range: U+0300 — U+036F (for letters) - | \xe2\x83[\x90-\xbf] #UNICODE range: U+20D0 — U+20FF (for symbols) - | \xe1\xb7[\x80-\xbf] #UNICODE range: U+1DC0 — U+1DFF (supplement) - | \xef\xb8[\xa0-\xaf] #UNICODE range: U+FE20 — U+FE2F (combining half marks) - '; - - /** - * @var array - */ - public static $html_special_chars_table = array( - '"' => "\x22", #U+0022 ["] " quotation mark = APL quote - '&' => "\x26", #U+0026 [&] & ampersand - '<' => "\x3c", #U+003C [<] < less-than sign - '>' => "\x3e", #U+003E [>] > greater-than sign - ); - - /** - * @link http://www.fileformat.info/format/w3c/entitytest.htm?sort=Unicode%20Character HTML Entity Browser Test Page - * @var array - */ - public static $html_entity_table = array( - #Latin-1 Entities: - ' ' => "\xc2\xa0", #U+00A0 [ ] no-break space = non-breaking space - '¡' => "\xc2\xa1", #U+00A1 [¡] inverted exclamation mark - '¢' => "\xc2\xa2", #U+00A2 [¢] cent sign - '£' => "\xc2\xa3", #U+00A3 [£] pound sign 
- '¤' => "\xc2\xa4", #U+00A4 [¤] currency sign - '¥' => "\xc2\xa5", #U+00A5 [¥] yen sign = yuan sign - '¦' => "\xc2\xa6", #U+00A6 [¦] broken bar = broken vertical bar - '§' => "\xc2\xa7", #U+00A7 [§] section sign - '¨' => "\xc2\xa8", #U+00A8 [¨] diaeresis = spacing diaeresis - '©' => "\xc2\xa9", #U+00A9 [©] copyright sign - 'ª' => "\xc2\xaa", #U+00AA [ª] feminine ordinal indicator - '«' => "\xc2\xab", #U+00AB [«] left-pointing double angle quotation mark = left pointing guillemet - '¬' => "\xc2\xac", #U+00AC [¬] not sign - '­' => "\xc2\xad", #U+00AD [ ] soft hyphen = discretionary hyphen - '®' => "\xc2\xae", #U+00AE [®] registered sign = registered trade mark sign - '¯' => "\xc2\xaf", #U+00AF [¯] macron = spacing macron = overline = APL overbar - '°' => "\xc2\xb0", #U+00B0 [°] degree sign - '±' => "\xc2\xb1", #U+00B1 [±] plus-minus sign = plus-or-minus sign - '²' => "\xc2\xb2", #U+00B2 [²] superscript two = superscript digit two = squared - '³' => "\xc2\xb3", #U+00B3 [³] superscript three = superscript digit three = cubed - '´' => "\xc2\xb4", #U+00B4 [´] acute accent = spacing acute - 'µ' => "\xc2\xb5", #U+00B5 [µ] micro sign - '¶' => "\xc2\xb6", #U+00B6 [¶] pilcrow sign = paragraph sign - '·' => "\xc2\xb7", #U+00B7 [·] middle dot = Georgian comma = Greek middle dot - '¸' => "\xc2\xb8", #U+00B8 [¸] cedilla = spacing cedilla - '¹' => "\xc2\xb9", #U+00B9 [¹] superscript one = superscript digit one - 'º' => "\xc2\xba", #U+00BA [º] masculine ordinal indicator - '»' => "\xc2\xbb", #U+00BB [»] right-pointing double angle quotation mark = right pointing guillemet - '¼' => "\xc2\xbc", #U+00BC [¼] vulgar fraction one quarter = fraction one quarter - '½' => "\xc2\xbd", #U+00BD [½] vulgar fraction one half = fraction one half - '¾' => "\xc2\xbe", #U+00BE [¾] vulgar fraction three quarters = fraction three quarters - '¿' => "\xc2\xbf", #U+00BF [¿] inverted question mark = turned question mark - #Latin capital letter - 'À' => "\xc3\x80", #Latin capital letter A with grave = 
Latin capital letter A grave - 'Á' => "\xc3\x81", #Latin capital letter A with acute - 'Â' => "\xc3\x82", #Latin capital letter A with circumflex - 'Ã' => "\xc3\x83", #Latin capital letter A with tilde - 'Ä' => "\xc3\x84", #Latin capital letter A with diaeresis - 'Å' => "\xc3\x85", #Latin capital letter A with ring above = Latin capital letter A ring - 'Æ' => "\xc3\x86", #Latin capital letter AE = Latin capital ligature AE - 'Ç' => "\xc3\x87", #Latin capital letter C with cedilla - 'È' => "\xc3\x88", #Latin capital letter E with grave - 'É' => "\xc3\x89", #Latin capital letter E with acute - 'Ê' => "\xc3\x8a", #Latin capital letter E with circumflex - 'Ë' => "\xc3\x8b", #Latin capital letter E with diaeresis - 'Ì' => "\xc3\x8c", #Latin capital letter I with grave - 'Í' => "\xc3\x8d", #Latin capital letter I with acute - 'Î' => "\xc3\x8e", #Latin capital letter I with circumflex - 'Ï' => "\xc3\x8f", #Latin capital letter I with diaeresis - 'Ð' => "\xc3\x90", #Latin capital letter ETH - 'Ñ' => "\xc3\x91", #Latin capital letter N with tilde - 'Ò' => "\xc3\x92", #Latin capital letter O with grave - 'Ó' => "\xc3\x93", #Latin capital letter O with acute - 'Ô' => "\xc3\x94", #Latin capital letter O with circumflex - 'Õ' => "\xc3\x95", #Latin capital letter O with tilde - 'Ö' => "\xc3\x96", #Latin capital letter O with diaeresis - '×' => "\xc3\x97", #U+00D7 [×] multiplication sign - 'Ø' => "\xc3\x98", #Latin capital letter O with stroke = Latin capital letter O slash - 'Ù' => "\xc3\x99", #Latin capital letter U with grave - 'Ú' => "\xc3\x9a", #Latin capital letter U with acute - 'Û' => "\xc3\x9b", #Latin capital letter U with circumflex - 'Ü' => "\xc3\x9c", #Latin capital letter U with diaeresis - 'Ý' => "\xc3\x9d", #Latin capital letter Y with acute - 'Þ' => "\xc3\x9e", #Latin capital letter THORN - #Latin small letter - 'ß' => "\xc3\x9f", #Latin small letter sharp s = ess-zed - 'à' => "\xc3\xa0", #Latin small letter a with grave = Latin small letter a grave - 'á' => 
"\xc3\xa1", #Latin small letter a with acute - 'â' => "\xc3\xa2", #Latin small letter a with circumflex - 'ã' => "\xc3\xa3", #Latin small letter a with tilde - 'ä' => "\xc3\xa4", #Latin small letter a with diaeresis - 'å' => "\xc3\xa5", #Latin small letter a with ring above = Latin small letter a ring - 'æ' => "\xc3\xa6", #Latin small letter ae = Latin small ligature ae - 'ç' => "\xc3\xa7", #Latin small letter c with cedilla - 'è' => "\xc3\xa8", #Latin small letter e with grave - 'é' => "\xc3\xa9", #Latin small letter e with acute - 'ê' => "\xc3\xaa", #Latin small letter e with circumflex - 'ë' => "\xc3\xab", #Latin small letter e with diaeresis - 'ì' => "\xc3\xac", #Latin small letter i with grave - 'í' => "\xc3\xad", #Latin small letter i with acute - 'î' => "\xc3\xae", #Latin small letter i with circumflex - 'ï' => "\xc3\xaf", #Latin small letter i with diaeresis - 'ð' => "\xc3\xb0", #Latin small letter eth - 'ñ' => "\xc3\xb1", #Latin small letter n with tilde - 'ò' => "\xc3\xb2", #Latin small letter o with grave - 'ó' => "\xc3\xb3", #Latin small letter o with acute - 'ô' => "\xc3\xb4", #Latin small letter o with circumflex - 'õ' => "\xc3\xb5", #Latin small letter o with tilde - 'ö' => "\xc3\xb6", #Latin small letter o with diaeresis - '÷' => "\xc3\xb7", #U+00F7 [÷] division sign - 'ø' => "\xc3\xb8", #Latin small letter o with stroke = Latin small letter o slash - 'ù' => "\xc3\xb9", #Latin small letter u with grave - 'ú' => "\xc3\xba", #Latin small letter u with acute - 'û' => "\xc3\xbb", #Latin small letter u with circumflex - 'ü' => "\xc3\xbc", #Latin small letter u with diaeresis - 'ý' => "\xc3\xbd", #Latin small letter y with acute - 'þ' => "\xc3\xbe", #Latin small letter thorn - 'ÿ' => "\xc3\xbf", #Latin small letter y with diaeresis - #Symbols and Greek Letters: - 'ƒ' => "\xc6\x92", #U+0192 [ƒ] Latin small f with hook = function = florin - 'Α' => "\xce\x91", #Greek capital letter alpha - 'Β' => "\xce\x92", #Greek capital letter beta - 'Γ' => "\xce\x93", 
#Greek capital letter gamma - 'Δ' => "\xce\x94", #Greek capital letter delta - 'Ε' => "\xce\x95", #Greek capital letter epsilon - 'Ζ' => "\xce\x96", #Greek capital letter zeta - 'Η' => "\xce\x97", #Greek capital letter eta - 'Θ' => "\xce\x98", #Greek capital letter theta - 'Ι' => "\xce\x99", #Greek capital letter iota - 'Κ' => "\xce\x9a", #Greek capital letter kappa - 'Λ' => "\xce\x9b", #Greek capital letter lambda - 'Μ' => "\xce\x9c", #Greek capital letter mu - 'Ν' => "\xce\x9d", #Greek capital letter nu - 'Ξ' => "\xce\x9e", #Greek capital letter xi - 'Ο' => "\xce\x9f", #Greek capital letter omicron - 'Π' => "\xce\xa0", #Greek capital letter pi - 'Ρ' => "\xce\xa1", #Greek capital letter rho - 'Σ' => "\xce\xa3", #Greek capital letter sigma - 'Τ' => "\xce\xa4", #Greek capital letter tau - 'Υ' => "\xce\xa5", #Greek capital letter upsilon - 'Φ' => "\xce\xa6", #Greek capital letter phi - 'Χ' => "\xce\xa7", #Greek capital letter chi - 'Ψ' => "\xce\xa8", #Greek capital letter psi - 'Ω' => "\xce\xa9", #Greek capital letter omega - 'α' => "\xce\xb1", #Greek small letter alpha - 'β' => "\xce\xb2", #Greek small letter beta - 'γ' => "\xce\xb3", #Greek small letter gamma - 'δ' => "\xce\xb4", #Greek small letter delta - 'ε' => "\xce\xb5", #Greek small letter epsilon - 'ζ' => "\xce\xb6", #Greek small letter zeta - 'η' => "\xce\xb7", #Greek small letter eta - 'θ' => "\xce\xb8", #Greek small letter theta - 'ι' => "\xce\xb9", #Greek small letter iota - 'κ' => "\xce\xba", #Greek small letter kappa - 'λ' => "\xce\xbb", #Greek small letter lambda - 'μ' => "\xce\xbc", #Greek small letter mu - 'ν' => "\xce\xbd", #Greek small letter nu - 'ξ' => "\xce\xbe", #Greek small letter xi - 'ο' => "\xce\xbf", #Greek small letter omicron - 'π' => "\xcf\x80", #Greek small letter pi - 'ρ' => "\xcf\x81", #Greek small letter rho - 'ς' => "\xcf\x82", #Greek small letter final sigma - 'σ' => "\xcf\x83", #Greek small letter sigma - 'τ' => "\xcf\x84", #Greek small letter tau - 'υ' => "\xcf\x85", #Greek 
small letter upsilon - 'φ' => "\xcf\x86", #Greek small letter phi - 'χ' => "\xcf\x87", #Greek small letter chi - 'ψ' => "\xcf\x88", #Greek small letter psi - 'ω' => "\xcf\x89", #Greek small letter omega - 'ϑ' => "\xcf\x91", #Greek small letter theta symbol - 'ϒ' => "\xcf\x92", #Greek upsilon with hook symbol - 'ϖ' => "\xcf\x96", #U+03D6 [ϖ] Greek pi symbol - - '•' => "\xe2\x80\xa2", #U+2022 [•] bullet = black small circle - '…' => "\xe2\x80\xa6", #U+2026 […] horizontal ellipsis = three dot leader - '′' => "\xe2\x80\xb2", #U+2032 [′] prime = minutes = feet (для обозначения минут и футов) - '″' => "\xe2\x80\xb3", #U+2033 [″] double prime = seconds = inches (для обозначения секунд и дюймов). - '‾' => "\xe2\x80\xbe", #U+203E [‾] overline = spacing overscore - '⁄' => "\xe2\x81\x84", #U+2044 [⁄] fraction slash - '℘' => "\xe2\x84\x98", #U+2118 [℘] script capital P = power set = Weierstrass p - 'ℑ' => "\xe2\x84\x91", #U+2111 [ℑ] blackletter capital I = imaginary part - 'ℜ' => "\xe2\x84\x9c", #U+211C [ℜ] blackletter capital R = real part symbol - '™' => "\xe2\x84\xa2", #U+2122 [™] trade mark sign - 'ℵ' => "\xe2\x84\xb5", #U+2135 [ℵ] alef symbol = first transfinite cardinal - '←' => "\xe2\x86\x90", #U+2190 [←] leftwards arrow - '↑' => "\xe2\x86\x91", #U+2191 [↑] upwards arrow - '→' => "\xe2\x86\x92", #U+2192 [→] rightwards arrow - '↓' => "\xe2\x86\x93", #U+2193 [↓] downwards arrow - '↔' => "\xe2\x86\x94", #U+2194 [↔] left right arrow - '↵' => "\xe2\x86\xb5", #U+21B5 [↵] downwards arrow with corner leftwards = carriage return - '⇐' => "\xe2\x87\x90", #U+21D0 [⇐] leftwards double arrow - '⇑' => "\xe2\x87\x91", #U+21D1 [⇑] upwards double arrow - '⇒' => "\xe2\x87\x92", #U+21D2 [⇒] rightwards double arrow - '⇓' => "\xe2\x87\x93", #U+21D3 [⇓] downwards double arrow - '⇔' => "\xe2\x87\x94", #U+21D4 [⇔] left right double arrow - '∀' => "\xe2\x88\x80", #U+2200 [∀] for all - '∂' => "\xe2\x88\x82", #U+2202 [∂] partial differential - '∃' => "\xe2\x88\x83", #U+2203 [∃] there exists - '∅' 
=> "\xe2\x88\x85", #U+2205 [∅] empty set = null set = diameter - '∇' => "\xe2\x88\x87", #U+2207 [∇] nabla = backward difference - '∈' => "\xe2\x88\x88", #U+2208 [∈] element of - '∉' => "\xe2\x88\x89", #U+2209 [∉] not an element of - '∋' => "\xe2\x88\x8b", #U+220B [∋] contains as member - '∏' => "\xe2\x88\x8f", #U+220F [∏] n-ary product = product sign - '∑' => "\xe2\x88\x91", #U+2211 [∑] n-ary sumation - '−' => "\xe2\x88\x92", #U+2212 [−] minus sign - '∗' => "\xe2\x88\x97", #U+2217 [∗] asterisk operator - '√' => "\xe2\x88\x9a", #U+221A [√] square root = radical sign - '∝' => "\xe2\x88\x9d", #U+221D [∝] proportional to - '∞' => "\xe2\x88\x9e", #U+221E [∞] infinity - '∠' => "\xe2\x88\xa0", #U+2220 [∠] angle - '∧' => "\xe2\x88\xa7", #U+2227 [∧] logical and = wedge - '∨' => "\xe2\x88\xa8", #U+2228 [∨] logical or = vee - '∩' => "\xe2\x88\xa9", #U+2229 [∩] intersection = cap - '∪' => "\xe2\x88\xaa", #U+222A [∪] union = cup - '∫' => "\xe2\x88\xab", #U+222B [∫] integral - '∴' => "\xe2\x88\xb4", #U+2234 [∴] therefore - '∼' => "\xe2\x88\xbc", #U+223C [∼] tilde operator = varies with = similar to - '≅' => "\xe2\x89\x85", #U+2245 [≅] approximately equal to - '≈' => "\xe2\x89\x88", #U+2248 [≈] almost equal to = asymptotic to - '≠' => "\xe2\x89\xa0", #U+2260 [≠] not equal to - '≡' => "\xe2\x89\xa1", #U+2261 [≡] identical to - '≤' => "\xe2\x89\xa4", #U+2264 [≤] less-than or equal to - '≥' => "\xe2\x89\xa5", #U+2265 [≥] greater-than or equal to - '⊂' => "\xe2\x8a\x82", #U+2282 [⊂] subset of - '⊃' => "\xe2\x8a\x83", #U+2283 [⊃] superset of - '⊄' => "\xe2\x8a\x84", #U+2284 [⊄] not a subset of - '⊆' => "\xe2\x8a\x86", #U+2286 [⊆] subset of or equal to - '⊇' => "\xe2\x8a\x87", #U+2287 [⊇] superset of or equal to - '⊕' => "\xe2\x8a\x95", #U+2295 [⊕] circled plus = direct sum - '⊗' => "\xe2\x8a\x97", #U+2297 [⊗] circled times = vector product - '⊥' => "\xe2\x8a\xa5", #U+22A5 [⊥] up tack = orthogonal to = perpendicular - '⋅' => "\xe2\x8b\x85", #U+22C5 [⋅] dot operator - '⌈' => 
"\xe2\x8c\x88", #U+2308 [⌈] left ceiling = APL upstile - '⌉' => "\xe2\x8c\x89", #U+2309 [⌉] right ceiling - '⌊' => "\xe2\x8c\x8a", #U+230A [⌊] left floor = APL downstile - '⌋' => "\xe2\x8c\x8b", #U+230B [⌋] right floor - '⟨' => "\xe2\x8c\xa9", #U+2329 [〈] left-pointing angle bracket = bra - '⟩' => "\xe2\x8c\xaa", #U+232A [〉] right-pointing angle bracket = ket - '◊' => "\xe2\x97\x8a", #U+25CA [◊] lozenge - '♠' => "\xe2\x99\xa0", #U+2660 [♠] black spade suit - '♣' => "\xe2\x99\xa3", #U+2663 [♣] black club suit = shamrock - '♥' => "\xe2\x99\xa5", #U+2665 [♥] black heart suit = valentine - '♦' => "\xe2\x99\xa6", #U+2666 [♦] black diamond suit - #Other Special Characters: - 'Œ' => "\xc5\x92", #U+0152 [Œ] Latin capital ligature OE - 'œ' => "\xc5\x93", #U+0153 [œ] Latin small ligature oe - 'Š' => "\xc5\xa0", #U+0160 [Š] Latin capital letter S with caron - 'š' => "\xc5\xa1", #U+0161 [š] Latin small letter s with caron - 'Ÿ' => "\xc5\xb8", #U+0178 [Ÿ] Latin capital letter Y with diaeresis - 'ˆ' => "\xcb\x86", #U+02C6 [ˆ] modifier letter circumflex accent - '˜' => "\xcb\x9c", #U+02DC [˜] small tilde - ' ' => "\xe2\x80\x82", #U+2002 [ ] en space - ' ' => "\xe2\x80\x83", #U+2003 [ ] em space - ' ' => "\xe2\x80\x89", #U+2009 [ ] thin space - '‌' => "\xe2\x80\x8c", #U+200C [‌] zero width non-joiner - '‍' => "\xe2\x80\x8d", #U+200D [‍] zero width joiner - '‎' => "\xe2\x80\x8e", #U+200E [‎] left-to-right mark - '‏' => "\xe2\x80\x8f", #U+200F [‏] right-to-left mark - '–' => "\xe2\x80\x93", #U+2013 [–] en dash - '—' => "\xe2\x80\x94", #U+2014 [—] em dash - '‘' => "\xe2\x80\x98", #U+2018 [‘] left single quotation mark - '’' => "\xe2\x80\x99", #U+2019 [’] right single quotation mark (and apostrophe!) 
- '‚' => "\xe2\x80\x9a", #U+201A [‚] single low-9 quotation mark - '“' => "\xe2\x80\x9c", #U+201C [“] left double quotation mark - '”' => "\xe2\x80\x9d", #U+201D [”] right double quotation mark - '„' => "\xe2\x80\x9e", #U+201E [„] double low-9 quotation mark - '†' => "\xe2\x80\xa0", #U+2020 [†] dagger - '‡' => "\xe2\x80\xa1", #U+2021 [‡] double dagger - '‰' => "\xe2\x80\xb0", #U+2030 [‰] per mille sign - '‹' => "\xe2\x80\xb9", #U+2039 [‹] single left-pointing angle quotation mark - '›' => "\xe2\x80\xba", #U+203A [›] single right-pointing angle quotation mark - '€' => "\xe2\x82\xac", #U+20AC [€] euro sign - ); - - /** - * This table contains the data on how cp1259 characters map into Unicode (UTF-8). - * The cp1259 map describes standart tatarish cyrillic charset and based on the cp1251 table. - * cp1259 -- this is an outdated one byte encoding of the Tatar language, - * which includes all the Russian letters from cp1251. - * - * @link http://search.cpan.org/CPAN/authors/id/A/AM/AMICHAUER/Lingua-TT-Yanalif-0.08.tar.gz - * @link http://www.unicode.org/charts/PDF/U0400.pdf - */ - public static $cp1259_table = array( - #bytes from 0x00 to 0x7F (ASCII) saved as is - "\x80" => "\xd3\x98", #U+04d8 CYRILLIC CAPITAL LETTER SCHWA - "\x81" => "\xd0\x83", #U+0403 CYRILLIC CAPITAL LETTER GJE - "\x82" => "\xe2\x80\x9a", #U+201a SINGLE LOW-9 QUOTATION MARK - "\x83" => "\xd1\x93", #U+0453 CYRILLIC SMALL LETTER GJE - "\x84" => "\xe2\x80\x9e", #U+201e DOUBLE LOW-9 QUOTATION MARK - "\x85" => "\xe2\x80\xa6", #U+2026 HORIZONTAL ELLIPSIS - "\x86" => "\xe2\x80\xa0", #U+2020 DAGGER - "\x87" => "\xe2\x80\xa1", #U+2021 DOUBLE DAGGER - "\x88" => "\xe2\x82\xac", #U+20ac EURO SIGN - "\x89" => "\xe2\x80\xb0", #U+2030 PER MILLE SIGN - "\x8a" => "\xd3\xa8", #U+04e8 CYRILLIC CAPITAL LETTER BARRED O - "\x8b" => "\xe2\x80\xb9", #U+2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK - "\x8c" => "\xd2\xae", #U+04ae CYRILLIC CAPITAL LETTER STRAIGHT U - "\x8d" => "\xd2\x96", #U+0496 CYRILLIC CAPITAL LETTER 
ZHE WITH DESCENDER - "\x8e" => "\xd2\xa2", #U+04a2 CYRILLIC CAPITAL LETTER EN WITH HOOK - "\x8f" => "\xd2\xba", #U+04ba CYRILLIC CAPITAL LETTER SHHA - "\x90" => "\xd3\x99", #U+04d9 CYRILLIC SMALL LETTER SCHWA - "\x91" => "\xe2\x80\x98", #U+2018 LEFT SINGLE QUOTATION MARK - "\x92" => "\xe2\x80\x99", #U+2019 RIGHT SINGLE QUOTATION MARK - "\x93" => "\xe2\x80\x9c", #U+201c LEFT DOUBLE QUOTATION MARK - "\x94" => "\xe2\x80\x9d", #U+201d RIGHT DOUBLE QUOTATION MARK - "\x95" => "\xe2\x80\xa2", #U+2022 BULLET - "\x96" => "\xe2\x80\x93", #U+2013 EN DASH - "\x97" => "\xe2\x80\x94", #U+2014 EM DASH - #"\x98" #UNDEFINED - "\x99" => "\xe2\x84\xa2", #U+2122 TRADE MARK SIGN - "\x9a" => "\xd3\xa9", #U+04e9 CYRILLIC SMALL LETTER BARRED O - "\x9b" => "\xe2\x80\xba", #U+203a SINGLE RIGHT-POINTING ANGLE QUOTATION MARK - "\x9c" => "\xd2\xaf", #U+04af CYRILLIC SMALL LETTER STRAIGHT U - "\x9d" => "\xd2\x97", #U+0497 CYRILLIC SMALL LETTER ZHE WITH DESCENDER - "\x9e" => "\xd2\xa3", #U+04a3 CYRILLIC SMALL LETTER EN WITH HOOK - "\x9f" => "\xd2\xbb", #U+04bb CYRILLIC SMALL LETTER SHHA - "\xa0" => "\xc2\xa0", #U+00a0 NO-BREAK SPACE - "\xa1" => "\xd0\x8e", #U+040e CYRILLIC CAPITAL LETTER SHORT U - "\xa2" => "\xd1\x9e", #U+045e CYRILLIC SMALL LETTER SHORT U - "\xa3" => "\xd0\x88", #U+0408 CYRILLIC CAPITAL LETTER JE - "\xa4" => "\xc2\xa4", #U+00a4 CURRENCY SIGN - "\xa5" => "\xd2\x90", #U+0490 CYRILLIC CAPITAL LETTER GHE WITH UPTURN - "\xa6" => "\xc2\xa6", #U+00a6 BROKEN BAR - "\xa7" => "\xc2\xa7", #U+00a7 SECTION SIGN - "\xa8" => "\xd0\x81", #U+0401 CYRILLIC CAPITAL LETTER IO - "\xa9" => "\xc2\xa9", #U+00a9 COPYRIGHT SIGN - "\xaa" => "\xd0\x84", #U+0404 CYRILLIC CAPITAL LETTER UKRAINIAN IE - "\xab" => "\xc2\xab", #U+00ab LEFT-POINTING DOUBLE ANGLE QUOTATION MARK - "\xac" => "\xc2\xac", #U+00ac NOT SIGN - "\xad" => "\xc2\xad", #U+00ad SOFT HYPHEN - "\xae" => "\xc2\xae", #U+00ae REGISTERED SIGN - "\xaf" => "\xd0\x87", #U+0407 CYRILLIC CAPITAL LETTER YI - "\xb0" => "\xc2\xb0", #U+00b0 DEGREE SIGN - 
"\xb1" => "\xc2\xb1", #U+00b1 PLUS-MINUS SIGN - "\xb2" => "\xd0\x86", #U+0406 CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I - "\xb3" => "\xd1\x96", #U+0456 CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I - "\xb4" => "\xd2\x91", #U+0491 CYRILLIC SMALL LETTER GHE WITH UPTURN - "\xb5" => "\xc2\xb5", #U+00b5 MICRO SIGN - "\xb6" => "\xc2\xb6", #U+00b6 PILCROW SIGN - "\xb7" => "\xc2\xb7", #U+00b7 MIDDLE DOT - "\xb8" => "\xd1\x91", #U+0451 CYRILLIC SMALL LETTER IO - "\xb9" => "\xe2\x84\x96", #U+2116 NUMERO SIGN - "\xba" => "\xd1\x94", #U+0454 CYRILLIC SMALL LETTER UKRAINIAN IE - "\xbb" => "\xc2\xbb", #U+00bb RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK - "\xbc" => "\xd1\x98", #U+0458 CYRILLIC SMALL LETTER JE - "\xbd" => "\xd0\x85", #U+0405 CYRILLIC CAPITAL LETTER DZE - "\xbe" => "\xd1\x95", #U+0455 CYRILLIC SMALL LETTER DZE - "\xbf" => "\xd1\x97", #U+0457 CYRILLIC SMALL LETTER YI - "\xc0" => "\xd0\x90", #U+0410 CYRILLIC CAPITAL LETTER A - "\xc1" => "\xd0\x91", #U+0411 CYRILLIC CAPITAL LETTER BE - "\xc2" => "\xd0\x92", #U+0412 CYRILLIC CAPITAL LETTER VE - "\xc3" => "\xd0\x93", #U+0413 CYRILLIC CAPITAL LETTER GHE - "\xc4" => "\xd0\x94", #U+0414 CYRILLIC CAPITAL LETTER DE - "\xc5" => "\xd0\x95", #U+0415 CYRILLIC CAPITAL LETTER IE - "\xc6" => "\xd0\x96", #U+0416 CYRILLIC CAPITAL LETTER ZHE - "\xc7" => "\xd0\x97", #U+0417 CYRILLIC CAPITAL LETTER ZE - "\xc8" => "\xd0\x98", #U+0418 CYRILLIC CAPITAL LETTER I - "\xc9" => "\xd0\x99", #U+0419 CYRILLIC CAPITAL LETTER SHORT I - "\xca" => "\xd0\x9a", #U+041a CYRILLIC CAPITAL LETTER KA - "\xcb" => "\xd0\x9b", #U+041b CYRILLIC CAPITAL LETTER EL - "\xcc" => "\xd0\x9c", #U+041c CYRILLIC CAPITAL LETTER EM - "\xcd" => "\xd0\x9d", #U+041d CYRILLIC CAPITAL LETTER EN - "\xce" => "\xd0\x9e", #U+041e CYRILLIC CAPITAL LETTER O - "\xcf" => "\xd0\x9f", #U+041f CYRILLIC CAPITAL LETTER PE - "\xd0" => "\xd0\xa0", #U+0420 CYRILLIC CAPITAL LETTER ER - "\xd1" => "\xd0\xa1", #U+0421 CYRILLIC CAPITAL LETTER ES - "\xd2" => "\xd0\xa2", #U+0422 CYRILLIC 
CAPITAL LETTER TE - "\xd3" => "\xd0\xa3", #U+0423 CYRILLIC CAPITAL LETTER U - "\xd4" => "\xd0\xa4", #U+0424 CYRILLIC CAPITAL LETTER EF - "\xd5" => "\xd0\xa5", #U+0425 CYRILLIC CAPITAL LETTER HA - "\xd6" => "\xd0\xa6", #U+0426 CYRILLIC CAPITAL LETTER TSE - "\xd7" => "\xd0\xa7", #U+0427 CYRILLIC CAPITAL LETTER CHE - "\xd8" => "\xd0\xa8", #U+0428 CYRILLIC CAPITAL LETTER SHA - "\xd9" => "\xd0\xa9", #U+0429 CYRILLIC CAPITAL LETTER SHCHA - "\xda" => "\xd0\xaa", #U+042a CYRILLIC CAPITAL LETTER HARD SIGN - "\xdb" => "\xd0\xab", #U+042b CYRILLIC CAPITAL LETTER YERU - "\xdc" => "\xd0\xac", #U+042c CYRILLIC CAPITAL LETTER SOFT SIGN - "\xdd" => "\xd0\xad", #U+042d CYRILLIC CAPITAL LETTER E - "\xde" => "\xd0\xae", #U+042e CYRILLIC CAPITAL LETTER YU - "\xdf" => "\xd0\xaf", #U+042f CYRILLIC CAPITAL LETTER YA - "\xe0" => "\xd0\xb0", #U+0430 CYRILLIC SMALL LETTER A - "\xe1" => "\xd0\xb1", #U+0431 CYRILLIC SMALL LETTER BE - "\xe2" => "\xd0\xb2", #U+0432 CYRILLIC SMALL LETTER VE - "\xe3" => "\xd0\xb3", #U+0433 CYRILLIC SMALL LETTER GHE - "\xe4" => "\xd0\xb4", #U+0434 CYRILLIC SMALL LETTER DE - "\xe5" => "\xd0\xb5", #U+0435 CYRILLIC SMALL LETTER IE - "\xe6" => "\xd0\xb6", #U+0436 CYRILLIC SMALL LETTER ZHE - "\xe7" => "\xd0\xb7", #U+0437 CYRILLIC SMALL LETTER ZE - "\xe8" => "\xd0\xb8", #U+0438 CYRILLIC SMALL LETTER I - "\xe9" => "\xd0\xb9", #U+0439 CYRILLIC SMALL LETTER SHORT I - "\xea" => "\xd0\xba", #U+043a CYRILLIC SMALL LETTER KA - "\xeb" => "\xd0\xbb", #U+043b CYRILLIC SMALL LETTER EL - "\xec" => "\xd0\xbc", #U+043c CYRILLIC SMALL LETTER EM - "\xed" => "\xd0\xbd", #U+043d CYRILLIC SMALL LETTER EN - "\xee" => "\xd0\xbe", #U+043e CYRILLIC SMALL LETTER O - "\xef" => "\xd0\xbf", #U+043f CYRILLIC SMALL LETTER PE - "\xf0" => "\xd1\x80", #U+0440 CYRILLIC SMALL LETTER ER - "\xf1" => "\xd1\x81", #U+0441 CYRILLIC SMALL LETTER ES - "\xf2" => "\xd1\x82", #U+0442 CYRILLIC SMALL LETTER TE - "\xf3" => "\xd1\x83", #U+0443 CYRILLIC SMALL LETTER U - "\xf4" => "\xd1\x84", #U+0444 CYRILLIC SMALL 
LETTER EF - "\xf5" => "\xd1\x85", #U+0445 CYRILLIC SMALL LETTER HA - "\xf6" => "\xd1\x86", #U+0446 CYRILLIC SMALL LETTER TSE - "\xf7" => "\xd1\x87", #U+0447 CYRILLIC SMALL LETTER CHE - "\xf8" => "\xd1\x88", #U+0448 CYRILLIC SMALL LETTER SHA - "\xf9" => "\xd1\x89", #U+0449 CYRILLIC SMALL LETTER SHCHA - "\xfa" => "\xd1\x8a", #U+044a CYRILLIC SMALL LETTER HARD SIGN - "\xfb" => "\xd1\x8b", #U+044b CYRILLIC SMALL LETTER YERU - "\xfc" => "\xd1\x8c", #U+044c CYRILLIC SMALL LETTER SOFT SIGN - "\xfd" => "\xd1\x8d", #U+044d CYRILLIC SMALL LETTER E - "\xfe" => "\xd1\x8e", #U+044e CYRILLIC SMALL LETTER YU - "\xff" => "\xd1\x8f", #U+044f CYRILLIC SMALL LETTER YA - ); - - /** - * UTF-8 Case lookup table - * - * This lookuptable defines the upper case letters to their correspponding - * lower case letter in UTF-8 - * - * @author Andreas Gohr - */ - public static $convert_case_table = array( - #CASE_UPPER => case_lower - "\x41" => "\x61", #A a - "\x42" => "\x62", #B b - "\x43" => "\x63", #C c - "\x44" => "\x64", #D d - "\x45" => "\x65", #E e - "\x46" => "\x66", #F f - "\x47" => "\x67", #G g - "\x48" => "\x68", #H h - "\x49" => "\x69", #I i - "\x4a" => "\x6a", #J j - "\x4b" => "\x6b", #K k - "\x4c" => "\x6c", #L l - "\x4d" => "\x6d", #M m - "\x4e" => "\x6e", #N n - "\x4f" => "\x6f", #O o - "\x50" => "\x70", #P p - "\x51" => "\x71", #Q q - "\x52" => "\x72", #R r - "\x53" => "\x73", #S s - "\x54" => "\x74", #T t - "\x55" => "\x75", #U u - "\x56" => "\x76", #V v - "\x57" => "\x77", #W w - "\x58" => "\x78", #X x - "\x59" => "\x79", #Y y - "\x5a" => "\x7a", #Z z - "\xc3\x80" => "\xc3\xa0", - "\xc3\x81" => "\xc3\xa1", - "\xc3\x82" => "\xc3\xa2", - "\xc3\x83" => "\xc3\xa3", - "\xc3\x84" => "\xc3\xa4", - "\xc3\x85" => "\xc3\xa5", - "\xc3\x86" => "\xc3\xa6", - "\xc3\x87" => "\xc3\xa7", - "\xc3\x88" => "\xc3\xa8", - "\xc3\x89" => "\xc3\xa9", - "\xc3\x8a" => "\xc3\xaa", - "\xc3\x8b" => "\xc3\xab", - "\xc3\x8c" => "\xc3\xac", - "\xc3\x8d" => "\xc3\xad", - "\xc3\x8e" => "\xc3\xae", - "\xc3\x8f" 
=> "\xc3\xaf", - "\xc3\x90" => "\xc3\xb0", - "\xc3\x91" => "\xc3\xb1", - "\xc3\x92" => "\xc3\xb2", - "\xc3\x93" => "\xc3\xb3", - "\xc3\x94" => "\xc3\xb4", - "\xc3\x95" => "\xc3\xb5", - "\xc3\x96" => "\xc3\xb6", - "\xc3\x98" => "\xc3\xb8", - "\xc3\x99" => "\xc3\xb9", - "\xc3\x9a" => "\xc3\xba", - "\xc3\x9b" => "\xc3\xbb", - "\xc3\x9c" => "\xc3\xbc", - "\xc3\x9d" => "\xc3\xbd", - "\xc3\x9e" => "\xc3\xbe", - "\xc4\x80" => "\xc4\x81", - "\xc4\x82" => "\xc4\x83", - "\xc4\x84" => "\xc4\x85", - "\xc4\x86" => "\xc4\x87", - "\xc4\x88" => "\xc4\x89", - "\xc4\x8a" => "\xc4\x8b", - "\xc4\x8c" => "\xc4\x8d", - "\xc4\x8e" => "\xc4\x8f", - "\xc4\x90" => "\xc4\x91", - "\xc4\x92" => "\xc4\x93", - "\xc4\x94" => "\xc4\x95", - "\xc4\x96" => "\xc4\x97", - "\xc4\x98" => "\xc4\x99", - "\xc4\x9a" => "\xc4\x9b", - "\xc4\x9c" => "\xc4\x9d", - "\xc4\x9e" => "\xc4\x9f", - "\xc4\xa0" => "\xc4\xa1", - "\xc4\xa2" => "\xc4\xa3", - "\xc4\xa4" => "\xc4\xa5", - "\xc4\xa6" => "\xc4\xa7", - "\xc4\xa8" => "\xc4\xa9", - "\xc4\xaa" => "\xc4\xab", - "\xc4\xac" => "\xc4\xad", - "\xc4\xae" => "\xc4\xaf", - "\xc4\xb2" => "\xc4\xb3", - "\xc4\xb4" => "\xc4\xb5", - "\xc4\xb6" => "\xc4\xb7", - "\xc4\xb9" => "\xc4\xba", - "\xc4\xbb" => "\xc4\xbc", - "\xc4\xbd" => "\xc4\xbe", - "\xc4\xbf" => "\xc5\x80", - "\xc5\x81" => "\xc5\x82", - "\xc5\x83" => "\xc5\x84", - "\xc5\x85" => "\xc5\x86", - "\xc5\x87" => "\xc5\x88", - "\xc5\x8a" => "\xc5\x8b", - "\xc5\x8c" => "\xc5\x8d", - "\xc5\x8e" => "\xc5\x8f", - "\xc5\x90" => "\xc5\x91", - "\xc5\x92" => "\xc5\x93", - "\xc5\x94" => "\xc5\x95", - "\xc5\x96" => "\xc5\x97", - "\xc5\x98" => "\xc5\x99", - "\xc5\x9a" => "\xc5\x9b", - "\xc5\x9c" => "\xc5\x9d", - "\xc5\x9e" => "\xc5\x9f", - "\xc5\xa0" => "\xc5\xa1", - "\xc5\xa2" => "\xc5\xa3", - "\xc5\xa4" => "\xc5\xa5", - "\xc5\xa6" => "\xc5\xa7", - "\xc5\xa8" => "\xc5\xa9", - "\xc5\xaa" => "\xc5\xab", - "\xc5\xac" => "\xc5\xad", - "\xc5\xae" => "\xc5\xaf", - "\xc5\xb0" => "\xc5\xb1", - "\xc5\xb2" => "\xc5\xb3", - "\xc5\xb4" => 
"\xc5\xb5", - "\xc5\xb6" => "\xc5\xb7", - "\xc5\xb8" => "\xc3\xbf", - "\xc5\xb9" => "\xc5\xba", - "\xc5\xbb" => "\xc5\xbc", - "\xc5\xbd" => "\xc5\xbe", - "\xc6\x81" => "\xc9\x93", - "\xc6\x82" => "\xc6\x83", - "\xc6\x84" => "\xc6\x85", - "\xc6\x86" => "\xc9\x94", - "\xc6\x87" => "\xc6\x88", - "\xc6\x89" => "\xc9\x96", - "\xc6\x8a" => "\xc9\x97", - "\xc6\x8b" => "\xc6\x8c", - "\xc6\x8e" => "\xc7\x9d", - "\xc6\x8f" => "\xc9\x99", - "\xc6\x90" => "\xc9\x9b", - "\xc6\x91" => "\xc6\x92", - "\xc6\x94" => "\xc9\xa3", - "\xc6\x96" => "\xc9\xa9", - "\xc6\x97" => "\xc9\xa8", - "\xc6\x98" => "\xc6\x99", - "\xc6\x9c" => "\xc9\xaf", - "\xc6\x9d" => "\xc9\xb2", - "\xc6\x9f" => "\xc9\xb5", - "\xc6\xa0" => "\xc6\xa1", - "\xc6\xa2" => "\xc6\xa3", - "\xc6\xa4" => "\xc6\xa5", - "\xc6\xa6" => "\xca\x80", - "\xc6\xa7" => "\xc6\xa8", - "\xc6\xa9" => "\xca\x83", - "\xc6\xac" => "\xc6\xad", - "\xc6\xae" => "\xca\x88", - "\xc6\xaf" => "\xc6\xb0", - "\xc6\xb1" => "\xca\x8a", - "\xc6\xb2" => "\xca\x8b", - "\xc6\xb3" => "\xc6\xb4", - "\xc6\xb5" => "\xc6\xb6", - "\xc6\xb7" => "\xca\x92", - "\xc6\xb8" => "\xc6\xb9", - "\xc6\xbc" => "\xc6\xbd", - "\xc7\x85" => "\xc7\x86", - "\xc7\x88" => "\xc7\x89", - "\xc7\x8b" => "\xc7\x8c", - "\xc7\x8d" => "\xc7\x8e", - "\xc7\x8f" => "\xc7\x90", - "\xc7\x91" => "\xc7\x92", - "\xc7\x93" => "\xc7\x94", - "\xc7\x95" => "\xc7\x96", - "\xc7\x97" => "\xc7\x98", - "\xc7\x99" => "\xc7\x9a", - "\xc7\x9b" => "\xc7\x9c", - "\xc7\x9e" => "\xc7\x9f", - "\xc7\xa0" => "\xc7\xa1", - "\xc7\xa2" => "\xc7\xa3", - "\xc7\xa4" => "\xc7\xa5", - "\xc7\xa6" => "\xc7\xa7", - "\xc7\xa8" => "\xc7\xa9", - "\xc7\xaa" => "\xc7\xab", - "\xc7\xac" => "\xc7\xad", - "\xc7\xae" => "\xc7\xaf", - "\xc7\xb2" => "\xc7\xb3", - "\xc7\xb4" => "\xc7\xb5", - "\xc7\xb6" => "\xc6\x95", - "\xc7\xb7" => "\xc6\xbf", - "\xc7\xb8" => "\xc7\xb9", - "\xc7\xba" => "\xc7\xbb", - "\xc7\xbc" => "\xc7\xbd", - "\xc7\xbe" => "\xc7\xbf", - "\xc8\x80" => "\xc8\x81", - "\xc8\x82" => "\xc8\x83", - "\xc8\x84" => "\xc8\x85", 
- "\xc8\x86" => "\xc8\x87", - "\xc8\x88" => "\xc8\x89", - "\xc8\x8a" => "\xc8\x8b", - "\xc8\x8c" => "\xc8\x8d", - "\xc8\x8e" => "\xc8\x8f", - "\xc8\x90" => "\xc8\x91", - "\xc8\x92" => "\xc8\x93", - "\xc8\x94" => "\xc8\x95", - "\xc8\x96" => "\xc8\x97", - "\xc8\x98" => "\xc8\x99", - "\xc8\x9a" => "\xc8\x9b", - "\xc8\x9c" => "\xc8\x9d", - "\xc8\x9e" => "\xc8\x9f", - "\xc8\xa0" => "\xc6\x9e", - "\xc8\xa2" => "\xc8\xa3", - "\xc8\xa4" => "\xc8\xa5", - "\xc8\xa6" => "\xc8\xa7", - "\xc8\xa8" => "\xc8\xa9", - "\xc8\xaa" => "\xc8\xab", - "\xc8\xac" => "\xc8\xad", - "\xc8\xae" => "\xc8\xaf", - "\xc8\xb0" => "\xc8\xb1", - "\xc8\xb2" => "\xc8\xb3", - "\xce\x86" => "\xce\xac", - "\xce\x88" => "\xce\xad", - "\xce\x89" => "\xce\xae", - "\xce\x8a" => "\xce\xaf", - "\xce\x8c" => "\xcf\x8c", - "\xce\x8e" => "\xcf\x8d", - "\xce\x8f" => "\xcf\x8e", - "\xce\x91" => "\xce\xb1", - "\xce\x92" => "\xce\xb2", - "\xce\x93" => "\xce\xb3", - "\xce\x94" => "\xce\xb4", - "\xce\x95" => "\xce\xb5", - "\xce\x96" => "\xce\xb6", - "\xce\x97" => "\xce\xb7", - "\xce\x98" => "\xce\xb8", - "\xce\x99" => "\xce\xb9", - "\xce\x9a" => "\xce\xba", - "\xce\x9b" => "\xce\xbb", - "\xce\x9c" => "\xc2\xb5", - "\xce\x9d" => "\xce\xbd", - "\xce\x9e" => "\xce\xbe", - "\xce\x9f" => "\xce\xbf", - "\xce\xa0" => "\xcf\x80", - "\xce\xa1" => "\xcf\x81", - "\xce\xa3" => "\xcf\x82", - "\xce\xa4" => "\xcf\x84", - "\xce\xa5" => "\xcf\x85", - "\xce\xa6" => "\xcf\x86", - "\xce\xa7" => "\xcf\x87", - "\xce\xa8" => "\xcf\x88", - "\xce\xa9" => "\xcf\x89", - "\xce\xaa" => "\xcf\x8a", - "\xce\xab" => "\xcf\x8b", - "\xcf\x98" => "\xcf\x99", - "\xcf\x9a" => "\xcf\x9b", - "\xcf\x9c" => "\xcf\x9d", - "\xcf\x9e" => "\xcf\x9f", - "\xcf\xa0" => "\xcf\xa1", - "\xcf\xa2" => "\xcf\xa3", - "\xcf\xa4" => "\xcf\xa5", - "\xcf\xa6" => "\xcf\xa7", - "\xcf\xa8" => "\xcf\xa9", - "\xcf\xaa" => "\xcf\xab", - "\xcf\xac" => "\xcf\xad", - "\xcf\xae" => "\xcf\xaf", - "\xd0\x80" => "\xd1\x90", - "\xd0\x81" => "\xd1\x91", - "\xd0\x82" => "\xd1\x92", - 
"\xd0\x83" => "\xd1\x93", - "\xd0\x84" => "\xd1\x94", - "\xd0\x85" => "\xd1\x95", - "\xd0\x86" => "\xd1\x96", - "\xd0\x87" => "\xd1\x97", - "\xd0\x88" => "\xd1\x98", - "\xd0\x89" => "\xd1\x99", - "\xd0\x8a" => "\xd1\x9a", - "\xd0\x8b" => "\xd1\x9b", - "\xd0\x8c" => "\xd1\x9c", - "\xd0\x8d" => "\xd1\x9d", - "\xd0\x8e" => "\xd1\x9e", - "\xd0\x8f" => "\xd1\x9f", - "\xd0\x90" => "\xd0\xb0", - "\xd0\x91" => "\xd0\xb1", - "\xd0\x92" => "\xd0\xb2", - "\xd0\x93" => "\xd0\xb3", - "\xd0\x94" => "\xd0\xb4", - "\xd0\x95" => "\xd0\xb5", - "\xd0\x96" => "\xd0\xb6", - "\xd0\x97" => "\xd0\xb7", - "\xd0\x98" => "\xd0\xb8", - "\xd0\x99" => "\xd0\xb9", - "\xd0\x9a" => "\xd0\xba", - "\xd0\x9b" => "\xd0\xbb", - "\xd0\x9c" => "\xd0\xbc", - "\xd0\x9d" => "\xd0\xbd", - "\xd0\x9e" => "\xd0\xbe", - "\xd0\x9f" => "\xd0\xbf", - "\xd0\xa0" => "\xd1\x80", - "\xd0\xa1" => "\xd1\x81", - "\xd0\xa2" => "\xd1\x82", - "\xd0\xa3" => "\xd1\x83", - "\xd0\xa4" => "\xd1\x84", - "\xd0\xa5" => "\xd1\x85", - "\xd0\xa6" => "\xd1\x86", - "\xd0\xa7" => "\xd1\x87", - "\xd0\xa8" => "\xd1\x88", - "\xd0\xa9" => "\xd1\x89", - "\xd0\xaa" => "\xd1\x8a", - "\xd0\xab" => "\xd1\x8b", - "\xd0\xac" => "\xd1\x8c", - "\xd0\xad" => "\xd1\x8d", - "\xd0\xae" => "\xd1\x8e", - "\xd0\xaf" => "\xd1\x8f", - "\xd1\xa0" => "\xd1\xa1", - "\xd1\xa2" => "\xd1\xa3", - "\xd1\xa4" => "\xd1\xa5", - "\xd1\xa6" => "\xd1\xa7", - "\xd1\xa8" => "\xd1\xa9", - "\xd1\xaa" => "\xd1\xab", - "\xd1\xac" => "\xd1\xad", - "\xd1\xae" => "\xd1\xaf", - "\xd1\xb0" => "\xd1\xb1", - "\xd1\xb2" => "\xd1\xb3", - "\xd1\xb4" => "\xd1\xb5", - "\xd1\xb6" => "\xd1\xb7", - "\xd1\xb8" => "\xd1\xb9", - "\xd1\xba" => "\xd1\xbb", - "\xd1\xbc" => "\xd1\xbd", - "\xd1\xbe" => "\xd1\xbf", - "\xd2\x80" => "\xd2\x81", - "\xd2\x8a" => "\xd2\x8b", - "\xd2\x8c" => "\xd2\x8d", - "\xd2\x8e" => "\xd2\x8f", - "\xd2\x90" => "\xd2\x91", - "\xd2\x92" => "\xd2\x93", - "\xd2\x94" => "\xd2\x95", - "\xd2\x96" => "\xd2\x97", - "\xd2\x98" => "\xd2\x99", - "\xd2\x9a" => "\xd2\x9b", - "\xd2\x9c" 
=> "\xd2\x9d", - "\xd2\x9e" => "\xd2\x9f", - "\xd2\xa0" => "\xd2\xa1", - "\xd2\xa2" => "\xd2\xa3", - "\xd2\xa4" => "\xd2\xa5", - "\xd2\xa6" => "\xd2\xa7", - "\xd2\xa8" => "\xd2\xa9", - "\xd2\xaa" => "\xd2\xab", - "\xd2\xac" => "\xd2\xad", - "\xd2\xae" => "\xd2\xaf", - "\xd2\xb0" => "\xd2\xb1", - "\xd2\xb2" => "\xd2\xb3", - "\xd2\xb4" => "\xd2\xb5", - "\xd2\xb6" => "\xd2\xb7", - "\xd2\xb8" => "\xd2\xb9", - "\xd2\xba" => "\xd2\xbb", - "\xd2\xbc" => "\xd2\xbd", - "\xd2\xbe" => "\xd2\xbf", - "\xd3\x81" => "\xd3\x82", - "\xd3\x83" => "\xd3\x84", - "\xd3\x85" => "\xd3\x86", - "\xd3\x87" => "\xd3\x88", - "\xd3\x89" => "\xd3\x8a", - "\xd3\x8b" => "\xd3\x8c", - "\xd3\x8d" => "\xd3\x8e", - "\xd3\x90" => "\xd3\x91", - "\xd3\x92" => "\xd3\x93", - "\xd3\x94" => "\xd3\x95", - "\xd3\x96" => "\xd3\x97", - "\xd3\x98" => "\xd3\x99", - "\xd3\x9a" => "\xd3\x9b", - "\xd3\x9c" => "\xd3\x9d", - "\xd3\x9e" => "\xd3\x9f", - "\xd3\xa0" => "\xd3\xa1", - "\xd3\xa2" => "\xd3\xa3", - "\xd3\xa4" => "\xd3\xa5", - "\xd3\xa6" => "\xd3\xa7", - "\xd3\xa8" => "\xd3\xa9", - "\xd3\xaa" => "\xd3\xab", - "\xd3\xac" => "\xd3\xad", - "\xd3\xae" => "\xd3\xaf", - "\xd3\xb0" => "\xd3\xb1", - "\xd3\xb2" => "\xd3\xb3", - "\xd3\xb4" => "\xd3\xb5", - "\xd3\xb8" => "\xd3\xb9", - "\xd4\x80" => "\xd4\x81", - "\xd4\x82" => "\xd4\x83", - "\xd4\x84" => "\xd4\x85", - "\xd4\x86" => "\xd4\x87", - "\xd4\x88" => "\xd4\x89", - "\xd4\x8a" => "\xd4\x8b", - "\xd4\x8c" => "\xd4\x8d", - "\xd4\x8e" => "\xd4\x8f", - "\xd4\xb1" => "\xd5\xa1", - "\xd4\xb2" => "\xd5\xa2", - "\xd4\xb3" => "\xd5\xa3", - "\xd4\xb4" => "\xd5\xa4", - "\xd4\xb5" => "\xd5\xa5", - "\xd4\xb6" => "\xd5\xa6", - "\xd4\xb7" => "\xd5\xa7", - "\xd4\xb8" => "\xd5\xa8", - "\xd4\xb9" => "\xd5\xa9", - "\xd4\xba" => "\xd5\xaa", - "\xd4\xbb" => "\xd5\xab", - "\xd4\xbc" => "\xd5\xac", - "\xd4\xbd" => "\xd5\xad", - "\xd4\xbe" => "\xd5\xae", - "\xd4\xbf" => "\xd5\xaf", - "\xd5\x80" => "\xd5\xb0", - "\xd5\x81" => "\xd5\xb1", - "\xd5\x82" => "\xd5\xb2", - "\xd5\x83" => 
"\xd5\xb3", - "\xd5\x84" => "\xd5\xb4", - "\xd5\x85" => "\xd5\xb5", - "\xd5\x86" => "\xd5\xb6", - "\xd5\x87" => "\xd5\xb7", - "\xd5\x88" => "\xd5\xb8", - "\xd5\x89" => "\xd5\xb9", - "\xd5\x8a" => "\xd5\xba", - "\xd5\x8b" => "\xd5\xbb", - "\xd5\x8c" => "\xd5\xbc", - "\xd5\x8d" => "\xd5\xbd", - "\xd5\x8e" => "\xd5\xbe", - "\xd5\x8f" => "\xd5\xbf", - "\xd5\x90" => "\xd6\x80", - "\xd5\x91" => "\xd6\x81", - "\xd5\x92" => "\xd6\x82", - "\xd5\x93" => "\xd6\x83", - "\xd5\x94" => "\xd6\x84", - "\xd5\x95" => "\xd6\x85", - "\xd5\x96" => "\xd6\x86", - "\xe1\xb8\x80" => "\xe1\xb8\x81", - "\xe1\xb8\x82" => "\xe1\xb8\x83", - "\xe1\xb8\x84" => "\xe1\xb8\x85", - "\xe1\xb8\x86" => "\xe1\xb8\x87", - "\xe1\xb8\x88" => "\xe1\xb8\x89", - "\xe1\xb8\x8a" => "\xe1\xb8\x8b", - "\xe1\xb8\x8c" => "\xe1\xb8\x8d", - "\xe1\xb8\x8e" => "\xe1\xb8\x8f", - "\xe1\xb8\x90" => "\xe1\xb8\x91", - "\xe1\xb8\x92" => "\xe1\xb8\x93", - "\xe1\xb8\x94" => "\xe1\xb8\x95", - "\xe1\xb8\x96" => "\xe1\xb8\x97", - "\xe1\xb8\x98" => "\xe1\xb8\x99", - "\xe1\xb8\x9a" => "\xe1\xb8\x9b", - "\xe1\xb8\x9c" => "\xe1\xb8\x9d", - "\xe1\xb8\x9e" => "\xe1\xb8\x9f", - "\xe1\xb8\xa0" => "\xe1\xb8\xa1", - "\xe1\xb8\xa2" => "\xe1\xb8\xa3", - "\xe1\xb8\xa4" => "\xe1\xb8\xa5", - "\xe1\xb8\xa6" => "\xe1\xb8\xa7", - "\xe1\xb8\xa8" => "\xe1\xb8\xa9", - "\xe1\xb8\xaa" => "\xe1\xb8\xab", - "\xe1\xb8\xac" => "\xe1\xb8\xad", - "\xe1\xb8\xae" => "\xe1\xb8\xaf", - "\xe1\xb8\xb0" => "\xe1\xb8\xb1", - "\xe1\xb8\xb2" => "\xe1\xb8\xb3", - "\xe1\xb8\xb4" => "\xe1\xb8\xb5", - "\xe1\xb8\xb6" => "\xe1\xb8\xb7", - "\xe1\xb8\xb8" => "\xe1\xb8\xb9", - "\xe1\xb8\xba" => "\xe1\xb8\xbb", - "\xe1\xb8\xbc" => "\xe1\xb8\xbd", - "\xe1\xb8\xbe" => "\xe1\xb8\xbf", - "\xe1\xb9\x80" => "\xe1\xb9\x81", - "\xe1\xb9\x82" => "\xe1\xb9\x83", - "\xe1\xb9\x84" => "\xe1\xb9\x85", - "\xe1\xb9\x86" => "\xe1\xb9\x87", - "\xe1\xb9\x88" => "\xe1\xb9\x89", - "\xe1\xb9\x8a" => "\xe1\xb9\x8b", - "\xe1\xb9\x8c" => "\xe1\xb9\x8d", - "\xe1\xb9\x8e" => "\xe1\xb9\x8f", - 
"\xe1\xb9\x90" => "\xe1\xb9\x91", - "\xe1\xb9\x92" => "\xe1\xb9\x93", - "\xe1\xb9\x94" => "\xe1\xb9\x95", - "\xe1\xb9\x96" => "\xe1\xb9\x97", - "\xe1\xb9\x98" => "\xe1\xb9\x99", - "\xe1\xb9\x9a" => "\xe1\xb9\x9b", - "\xe1\xb9\x9c" => "\xe1\xb9\x9d", - "\xe1\xb9\x9e" => "\xe1\xb9\x9f", - "\xe1\xb9\xa0" => "\xe1\xb9\xa1", - "\xe1\xb9\xa2" => "\xe1\xb9\xa3", - "\xe1\xb9\xa4" => "\xe1\xb9\xa5", - "\xe1\xb9\xa6" => "\xe1\xb9\xa7", - "\xe1\xb9\xa8" => "\xe1\xb9\xa9", - "\xe1\xb9\xaa" => "\xe1\xb9\xab", - "\xe1\xb9\xac" => "\xe1\xb9\xad", - "\xe1\xb9\xae" => "\xe1\xb9\xaf", - "\xe1\xb9\xb0" => "\xe1\xb9\xb1", - "\xe1\xb9\xb2" => "\xe1\xb9\xb3", - "\xe1\xb9\xb4" => "\xe1\xb9\xb5", - "\xe1\xb9\xb6" => "\xe1\xb9\xb7", - "\xe1\xb9\xb8" => "\xe1\xb9\xb9", - "\xe1\xb9\xba" => "\xe1\xb9\xbb", - "\xe1\xb9\xbc" => "\xe1\xb9\xbd", - "\xe1\xb9\xbe" => "\xe1\xb9\xbf", - "\xe1\xba\x80" => "\xe1\xba\x81", - "\xe1\xba\x82" => "\xe1\xba\x83", - "\xe1\xba\x84" => "\xe1\xba\x85", - "\xe1\xba\x86" => "\xe1\xba\x87", - "\xe1\xba\x88" => "\xe1\xba\x89", - "\xe1\xba\x8a" => "\xe1\xba\x8b", - "\xe1\xba\x8c" => "\xe1\xba\x8d", - "\xe1\xba\x8e" => "\xe1\xba\x8f", - "\xe1\xba\x90" => "\xe1\xba\x91", - "\xe1\xba\x92" => "\xe1\xba\x93", - "\xe1\xba\x94" => "\xe1\xba\x95", - "\xe1\xba\xa0" => "\xe1\xba\xa1", - "\xe1\xba\xa2" => "\xe1\xba\xa3", - "\xe1\xba\xa4" => "\xe1\xba\xa5", - "\xe1\xba\xa6" => "\xe1\xba\xa7", - "\xe1\xba\xa8" => "\xe1\xba\xa9", - "\xe1\xba\xaa" => "\xe1\xba\xab", - "\xe1\xba\xac" => "\xe1\xba\xad", - "\xe1\xba\xae" => "\xe1\xba\xaf", - "\xe1\xba\xb0" => "\xe1\xba\xb1", - "\xe1\xba\xb2" => "\xe1\xba\xb3", - "\xe1\xba\xb4" => "\xe1\xba\xb5", - "\xe1\xba\xb6" => "\xe1\xba\xb7", - "\xe1\xba\xb8" => "\xe1\xba\xb9", - "\xe1\xba\xba" => "\xe1\xba\xbb", - "\xe1\xba\xbc" => "\xe1\xba\xbd", - "\xe1\xba\xbe" => "\xe1\xba\xbf", - "\xe1\xbb\x80" => "\xe1\xbb\x81", - "\xe1\xbb\x82" => "\xe1\xbb\x83", - "\xe1\xbb\x84" => "\xe1\xbb\x85", - "\xe1\xbb\x86" => "\xe1\xbb\x87", - "\xe1\xbb\x88" => 
"\xe1\xbb\x89", - "\xe1\xbb\x8a" => "\xe1\xbb\x8b", - "\xe1\xbb\x8c" => "\xe1\xbb\x8d", - "\xe1\xbb\x8e" => "\xe1\xbb\x8f", - "\xe1\xbb\x90" => "\xe1\xbb\x91", - "\xe1\xbb\x92" => "\xe1\xbb\x93", - "\xe1\xbb\x94" => "\xe1\xbb\x95", - "\xe1\xbb\x96" => "\xe1\xbb\x97", - "\xe1\xbb\x98" => "\xe1\xbb\x99", - "\xe1\xbb\x9a" => "\xe1\xbb\x9b", - "\xe1\xbb\x9c" => "\xe1\xbb\x9d", - "\xe1\xbb\x9e" => "\xe1\xbb\x9f", - "\xe1\xbb\xa0" => "\xe1\xbb\xa1", - "\xe1\xbb\xa2" => "\xe1\xbb\xa3", - "\xe1\xbb\xa4" => "\xe1\xbb\xa5", - "\xe1\xbb\xa6" => "\xe1\xbb\xa7", - "\xe1\xbb\xa8" => "\xe1\xbb\xa9", - "\xe1\xbb\xaa" => "\xe1\xbb\xab", - "\xe1\xbb\xac" => "\xe1\xbb\xad", - "\xe1\xbb\xae" => "\xe1\xbb\xaf", - "\xe1\xbb\xb0" => "\xe1\xbb\xb1", - "\xe1\xbb\xb2" => "\xe1\xbb\xb3", - "\xe1\xbb\xb4" => "\xe1\xbb\xb5", - "\xe1\xbb\xb6" => "\xe1\xbb\xb7", - "\xe1\xbb\xb8" => "\xe1\xbb\xb9", - "\xe1\xbc\x88" => "\xe1\xbc\x80", - "\xe1\xbc\x89" => "\xe1\xbc\x81", - "\xe1\xbc\x8a" => "\xe1\xbc\x82", - "\xe1\xbc\x8b" => "\xe1\xbc\x83", - "\xe1\xbc\x8c" => "\xe1\xbc\x84", - "\xe1\xbc\x8d" => "\xe1\xbc\x85", - "\xe1\xbc\x8e" => "\xe1\xbc\x86", - "\xe1\xbc\x8f" => "\xe1\xbc\x87", - "\xe1\xbc\x98" => "\xe1\xbc\x90", - "\xe1\xbc\x99" => "\xe1\xbc\x91", - "\xe1\xbc\x9a" => "\xe1\xbc\x92", - "\xe1\xbc\x9b" => "\xe1\xbc\x93", - "\xe1\xbc\x9c" => "\xe1\xbc\x94", - "\xe1\xbc\x9d" => "\xe1\xbc\x95", - "\xe1\xbc\xa9" => "\xe1\xbc\xa1", - "\xe1\xbc\xaa" => "\xe1\xbc\xa2", - "\xe1\xbc\xab" => "\xe1\xbc\xa3", - "\xe1\xbc\xac" => "\xe1\xbc\xa4", - "\xe1\xbc\xad" => "\xe1\xbc\xa5", - "\xe1\xbc\xae" => "\xe1\xbc\xa6", - "\xe1\xbc\xaf" => "\xe1\xbc\xa7", - "\xe1\xbc\xb8" => "\xe1\xbc\xb0", - "\xe1\xbc\xb9" => "\xe1\xbc\xb1", - "\xe1\xbc\xba" => "\xe1\xbc\xb2", - "\xe1\xbc\xbb" => "\xe1\xbc\xb3", - "\xe1\xbc\xbc" => "\xe1\xbc\xb4", - "\xe1\xbc\xbd" => "\xe1\xbc\xb5", - "\xe1\xbc\xbe" => "\xe1\xbc\xb6", - "\xe1\xbc\xbf" => "\xe1\xbc\xb7", - "\xe1\xbd\x88" => "\xe1\xbd\x80", - "\xe1\xbd\x89" => "\xe1\xbd\x81", - 
"\xe1\xbd\x8a" => "\xe1\xbd\x82", - "\xe1\xbd\x8b" => "\xe1\xbd\x83", - "\xe1\xbd\x8c" => "\xe1\xbd\x84", - "\xe1\xbd\x8d" => "\xe1\xbd\x85", - "\xe1\xbd\x99" => "\xe1\xbd\x91", - "\xe1\xbd\x9b" => "\xe1\xbd\x93", - "\xe1\xbd\x9d" => "\xe1\xbd\x95", - "\xe1\xbd\x9f" => "\xe1\xbd\x97", - "\xe1\xbd\xa9" => "\xe1\xbd\xa1", - "\xe1\xbd\xaa" => "\xe1\xbd\xa2", - "\xe1\xbd\xab" => "\xe1\xbd\xa3", - "\xe1\xbd\xac" => "\xe1\xbd\xa4", - "\xe1\xbd\xad" => "\xe1\xbd\xa5", - "\xe1\xbd\xae" => "\xe1\xbd\xa6", - "\xe1\xbd\xaf" => "\xe1\xbd\xa7", - "\xe1\xbe\x88" => "\xe1\xbe\x80", - "\xe1\xbe\x89" => "\xe1\xbe\x81", - "\xe1\xbe\x8a" => "\xe1\xbe\x82", - "\xe1\xbe\x8b" => "\xe1\xbe\x83", - "\xe1\xbe\x8c" => "\xe1\xbe\x84", - "\xe1\xbe\x8d" => "\xe1\xbe\x85", - "\xe1\xbe\x8e" => "\xe1\xbe\x86", - "\xe1\xbe\x8f" => "\xe1\xbe\x87", - "\xe1\xbe\x98" => "\xe1\xbe\x90", - "\xe1\xbe\x99" => "\xe1\xbe\x91", - "\xe1\xbe\x9a" => "\xe1\xbe\x92", - "\xe1\xbe\x9b" => "\xe1\xbe\x93", - "\xe1\xbe\x9c" => "\xe1\xbe\x94", - "\xe1\xbe\x9d" => "\xe1\xbe\x95", - "\xe1\xbe\x9e" => "\xe1\xbe\x96", - "\xe1\xbe\x9f" => "\xe1\xbe\x97", - "\xe1\xbe\xa9" => "\xe1\xbe\xa1", - "\xe1\xbe\xaa" => "\xe1\xbe\xa2", - "\xe1\xbe\xab" => "\xe1\xbe\xa3", - "\xe1\xbe\xac" => "\xe1\xbe\xa4", - "\xe1\xbe\xad" => "\xe1\xbe\xa5", - "\xe1\xbe\xae" => "\xe1\xbe\xa6", - "\xe1\xbe\xaf" => "\xe1\xbe\xa7", - "\xe1\xbe\xb8" => "\xe1\xbe\xb0", - "\xe1\xbe\xb9" => "\xe1\xbe\xb1", - "\xe1\xbe\xba" => "\xe1\xbd\xb0", - "\xe1\xbe\xbb" => "\xe1\xbd\xb1", - "\xe1\xbe\xbc" => "\xe1\xbe\xb3", - "\xe1\xbf\x88" => "\xe1\xbd\xb2", - "\xe1\xbf\x89" => "\xe1\xbd\xb3", - "\xe1\xbf\x8a" => "\xe1\xbd\xb4", - "\xe1\xbf\x8b" => "\xe1\xbd\xb5", - "\xe1\xbf\x8c" => "\xe1\xbf\x83", - "\xe1\xbf\x98" => "\xe1\xbf\x90", - "\xe1\xbf\x99" => "\xe1\xbf\x91", - "\xe1\xbf\x9a" => "\xe1\xbd\xb6", - "\xe1\xbf\x9b" => "\xe1\xbd\xb7", - "\xe1\xbf\xa9" => "\xe1\xbf\xa1", - "\xe1\xbf\xaa" => "\xe1\xbd\xba", - "\xe1\xbf\xab" => "\xe1\xbd\xbb", - "\xe1\xbf\xac" => 
"\xe1\xbf\xa5", - "\xe1\xbf\xb8" => "\xe1\xbd\xb8", - "\xe1\xbf\xb9" => "\xe1\xbd\xb9", - "\xe1\xbf\xba" => "\xe1\xbd\xbc", - "\xe1\xbf\xbb" => "\xe1\xbd\xbd", - "\xe1\xbf\xbc" => "\xe1\xbf\xb3", - "\xef\xbc\xa1" => "\xef\xbd\x81", - "\xef\xbc\xa2" => "\xef\xbd\x82", - "\xef\xbc\xa3" => "\xef\xbd\x83", - "\xef\xbc\xa4" => "\xef\xbd\x84", - "\xef\xbc\xa5" => "\xef\xbd\x85", - "\xef\xbc\xa6" => "\xef\xbd\x86", - "\xef\xbc\xa7" => "\xef\xbd\x87", - "\xef\xbc\xa8" => "\xef\xbd\x88", - "\xef\xbc\xa9" => "\xef\xbd\x89", - "\xef\xbc\xaa" => "\xef\xbd\x8a", - "\xef\xbc\xab" => "\xef\xbd\x8b", - "\xef\xbc\xac" => "\xef\xbd\x8c", - "\xef\xbc\xad" => "\xef\xbd\x8d", - "\xef\xbc\xae" => "\xef\xbd\x8e", - "\xef\xbc\xaf" => "\xef\xbd\x8f", - "\xef\xbc\xb0" => "\xef\xbd\x90", - "\xef\xbc\xb1" => "\xef\xbd\x91", - "\xef\xbc\xb2" => "\xef\xbd\x92", - "\xef\xbc\xb3" => "\xef\xbd\x93", - "\xef\xbc\xb4" => "\xef\xbd\x94", - "\xef\xbc\xb5" => "\xef\xbd\x95", - "\xef\xbc\xb6" => "\xef\xbd\x96", - "\xef\xbc\xb7" => "\xef\xbd\x97", - "\xef\xbc\xb8" => "\xef\xbd\x98", - "\xef\xbc\xb9" => "\xef\xbd\x99", - "\xef\xbc\xba" => "\xef\xbd\x9a", - ); - - #Unicode Character Database 6.0.0 (2010-06-04) - #autogenerated by unicode_blocks_txt2php() PHP function at 2011-06-04 00:19:39, 209 blocks total - public static $unicode_blocks = array( - 'Basic Latin' => array( - 0 => 0x0000, - 1 => 0x007F, - 2 => 0, - ), - 'Latin-1 Supplement' => array( - 0 => 0x0080, - 1 => 0x00FF, - 2 => 1, - ), - 'Latin Extended-A' => array( - 0 => 0x0100, - 1 => 0x017F, - 2 => 2, - ), - 'Latin Extended-B' => array( - 0 => 0x0180, - 1 => 0x024F, - 2 => 3, - ), - 'IPA Extensions' => array( - 0 => 0x0250, - 1 => 0x02AF, - 2 => 4, - ), - 'Spacing Modifier Letters' => array( - 0 => 0x02B0, - 1 => 0x02FF, - 2 => 5, - ), - 'Combining Diacritical Marks' => array( - 0 => 0x0300, - 1 => 0x036F, - 2 => 6, - ), - 'Greek and Coptic' => array( - 0 => 0x0370, - 1 => 0x03FF, - 2 => 7, - ), - 'Cyrillic' => array( - 0 => 0x0400, - 1 => 
0x04FF, - 2 => 8, - ), - 'Cyrillic Supplement' => array( - 0 => 0x0500, - 1 => 0x052F, - 2 => 9, - ), - 'Armenian' => array( - 0 => 0x0530, - 1 => 0x058F, - 2 => 10, - ), - 'Hebrew' => array( - 0 => 0x0590, - 1 => 0x05FF, - 2 => 11, - ), - 'Arabic' => array( - 0 => 0x0600, - 1 => 0x06FF, - 2 => 12, - ), - 'Syriac' => array( - 0 => 0x0700, - 1 => 0x074F, - 2 => 13, - ), - 'Arabic Supplement' => array( - 0 => 0x0750, - 1 => 0x077F, - 2 => 14, - ), - 'Thaana' => array( - 0 => 0x0780, - 1 => 0x07BF, - 2 => 15, - ), - 'NKo' => array( - 0 => 0x07C0, - 1 => 0x07FF, - 2 => 16, - ), - 'Samaritan' => array( - 0 => 0x0800, - 1 => 0x083F, - 2 => 17, - ), - 'Mandaic' => array( - 0 => 0x0840, - 1 => 0x085F, - 2 => 18, - ), - 'Devanagari' => array( - 0 => 0x0900, - 1 => 0x097F, - 2 => 19, - ), - 'Bengali' => array( - 0 => 0x0980, - 1 => 0x09FF, - 2 => 20, - ), - 'Gurmukhi' => array( - 0 => 0x0A00, - 1 => 0x0A7F, - 2 => 21, - ), - 'Gujarati' => array( - 0 => 0x0A80, - 1 => 0x0AFF, - 2 => 22, - ), - 'Oriya' => array( - 0 => 0x0B00, - 1 => 0x0B7F, - 2 => 23, - ), - 'Tamil' => array( - 0 => 0x0B80, - 1 => 0x0BFF, - 2 => 24, - ), - 'Telugu' => array( - 0 => 0x0C00, - 1 => 0x0C7F, - 2 => 25, - ), - 'Kannada' => array( - 0 => 0x0C80, - 1 => 0x0CFF, - 2 => 26, - ), - 'Malayalam' => array( - 0 => 0x0D00, - 1 => 0x0D7F, - 2 => 27, - ), - 'Sinhala' => array( - 0 => 0x0D80, - 1 => 0x0DFF, - 2 => 28, - ), - 'Thai' => array( - 0 => 0x0E00, - 1 => 0x0E7F, - 2 => 29, - ), - 'Lao' => array( - 0 => 0x0E80, - 1 => 0x0EFF, - 2 => 30, - ), - 'Tibetan' => array( - 0 => 0x0F00, - 1 => 0x0FFF, - 2 => 31, - ), - 'Myanmar' => array( - 0 => 0x1000, - 1 => 0x109F, - 2 => 32, - ), - 'Georgian' => array( - 0 => 0x10A0, - 1 => 0x10FF, - 2 => 33, - ), - 'Hangul Jamo' => array( - 0 => 0x1100, - 1 => 0x11FF, - 2 => 34, - ), - 'Ethiopic' => array( - 0 => 0x1200, - 1 => 0x137F, - 2 => 35, - ), - 'Ethiopic Supplement' => array( - 0 => 0x1380, - 1 => 0x139F, - 2 => 36, - ), - 'Cherokee' => array( - 0 => 0x13A0, - 1 
=> 0x13FF, - 2 => 37, - ), - 'Unified Canadian Aboriginal Syllabics' => array( - 0 => 0x1400, - 1 => 0x167F, - 2 => 38, - ), - 'Ogham' => array( - 0 => 0x1680, - 1 => 0x169F, - 2 => 39, - ), - 'Runic' => array( - 0 => 0x16A0, - 1 => 0x16FF, - 2 => 40, - ), - 'Tagalog' => array( - 0 => 0x1700, - 1 => 0x171F, - 2 => 41, - ), - 'Hanunoo' => array( - 0 => 0x1720, - 1 => 0x173F, - 2 => 42, - ), - 'Buhid' => array( - 0 => 0x1740, - 1 => 0x175F, - 2 => 43, - ), - 'Tagbanwa' => array( - 0 => 0x1760, - 1 => 0x177F, - 2 => 44, - ), - 'Khmer' => array( - 0 => 0x1780, - 1 => 0x17FF, - 2 => 45, - ), - 'Mongolian' => array( - 0 => 0x1800, - 1 => 0x18AF, - 2 => 46, - ), - 'Unified Canadian Aboriginal Syllabics Extended' => array( - 0 => 0x18B0, - 1 => 0x18FF, - 2 => 47, - ), - 'Limbu' => array( - 0 => 0x1900, - 1 => 0x194F, - 2 => 48, - ), - 'Tai Le' => array( - 0 => 0x1950, - 1 => 0x197F, - 2 => 49, - ), - 'New Tai Lue' => array( - 0 => 0x1980, - 1 => 0x19DF, - 2 => 50, - ), - 'Khmer Symbols' => array( - 0 => 0x19E0, - 1 => 0x19FF, - 2 => 51, - ), - 'Buginese' => array( - 0 => 0x1A00, - 1 => 0x1A1F, - 2 => 52, - ), - 'Tai Tham' => array( - 0 => 0x1A20, - 1 => 0x1AAF, - 2 => 53, - ), - 'Balinese' => array( - 0 => 0x1B00, - 1 => 0x1B7F, - 2 => 54, - ), - 'Sundanese' => array( - 0 => 0x1B80, - 1 => 0x1BBF, - 2 => 55, - ), - 'Batak' => array( - 0 => 0x1BC0, - 1 => 0x1BFF, - 2 => 56, - ), - 'Lepcha' => array( - 0 => 0x1C00, - 1 => 0x1C4F, - 2 => 57, - ), - 'Ol Chiki' => array( - 0 => 0x1C50, - 1 => 0x1C7F, - 2 => 58, - ), - 'Vedic Extensions' => array( - 0 => 0x1CD0, - 1 => 0x1CFF, - 2 => 59, - ), - 'Phonetic Extensions' => array( - 0 => 0x1D00, - 1 => 0x1D7F, - 2 => 60, - ), - 'Phonetic Extensions Supplement' => array( - 0 => 0x1D80, - 1 => 0x1DBF, - 2 => 61, - ), - 'Combining Diacritical Marks Supplement' => array( - 0 => 0x1DC0, - 1 => 0x1DFF, - 2 => 62, - ), - 'Latin Extended Additional' => array( - 0 => 0x1E00, - 1 => 0x1EFF, - 2 => 63, - ), - 'Greek Extended' => array( - 0 => 
0x1F00, - 1 => 0x1FFF, - 2 => 64, - ), - 'General Punctuation' => array( - 0 => 0x2000, - 1 => 0x206F, - 2 => 65, - ), - 'Superscripts and Subscripts' => array( - 0 => 0x2070, - 1 => 0x209F, - 2 => 66, - ), - 'Currency Symbols' => array( - 0 => 0x20A0, - 1 => 0x20CF, - 2 => 67, - ), - 'Combining Diacritical Marks for Symbols' => array( - 0 => 0x20D0, - 1 => 0x20FF, - 2 => 68, - ), - 'Letterlike Symbols' => array( - 0 => 0x2100, - 1 => 0x214F, - 2 => 69, - ), - 'Number Forms' => array( - 0 => 0x2150, - 1 => 0x218F, - 2 => 70, - ), - 'Arrows' => array( - 0 => 0x2190, - 1 => 0x21FF, - 2 => 71, - ), - 'Mathematical Operators' => array( - 0 => 0x2200, - 1 => 0x22FF, - 2 => 72, - ), - 'Miscellaneous Technical' => array( - 0 => 0x2300, - 1 => 0x23FF, - 2 => 73, - ), - 'Control Pictures' => array( - 0 => 0x2400, - 1 => 0x243F, - 2 => 74, - ), - 'Optical Character Recognition' => array( - 0 => 0x2440, - 1 => 0x245F, - 2 => 75, - ), - 'Enclosed Alphanumerics' => array( - 0 => 0x2460, - 1 => 0x24FF, - 2 => 76, - ), - 'Box Drawing' => array( - 0 => 0x2500, - 1 => 0x257F, - 2 => 77, - ), - 'Block Elements' => array( - 0 => 0x2580, - 1 => 0x259F, - 2 => 78, - ), - 'Geometric Shapes' => array( - 0 => 0x25A0, - 1 => 0x25FF, - 2 => 79, - ), - 'Miscellaneous Symbols' => array( - 0 => 0x2600, - 1 => 0x26FF, - 2 => 80, - ), - 'Dingbats' => array( - 0 => 0x2700, - 1 => 0x27BF, - 2 => 81, - ), - 'Miscellaneous Mathematical Symbols-A' => array( - 0 => 0x27C0, - 1 => 0x27EF, - 2 => 82, - ), - 'Supplemental Arrows-A' => array( - 0 => 0x27F0, - 1 => 0x27FF, - 2 => 83, - ), - 'Braille Patterns' => array( - 0 => 0x2800, - 1 => 0x28FF, - 2 => 84, - ), - 'Supplemental Arrows-B' => array( - 0 => 0x2900, - 1 => 0x297F, - 2 => 85, - ), - 'Miscellaneous Mathematical Symbols-B' => array( - 0 => 0x2980, - 1 => 0x29FF, - 2 => 86, - ), - 'Supplemental Mathematical Operators' => array( - 0 => 0x2A00, - 1 => 0x2AFF, - 2 => 87, - ), - 'Miscellaneous Symbols and Arrows' => array( - 0 => 0x2B00, - 1 => 
0x2BFF, - 2 => 88, - ), - 'Glagolitic' => array( - 0 => 0x2C00, - 1 => 0x2C5F, - 2 => 89, - ), - 'Latin Extended-C' => array( - 0 => 0x2C60, - 1 => 0x2C7F, - 2 => 90, - ), - 'Coptic' => array( - 0 => 0x2C80, - 1 => 0x2CFF, - 2 => 91, - ), - 'Georgian Supplement' => array( - 0 => 0x2D00, - 1 => 0x2D2F, - 2 => 92, - ), - 'Tifinagh' => array( - 0 => 0x2D30, - 1 => 0x2D7F, - 2 => 93, - ), - 'Ethiopic Extended' => array( - 0 => 0x2D80, - 1 => 0x2DDF, - 2 => 94, - ), - 'Cyrillic Extended-A' => array( - 0 => 0x2DE0, - 1 => 0x2DFF, - 2 => 95, - ), - 'Supplemental Punctuation' => array( - 0 => 0x2E00, - 1 => 0x2E7F, - 2 => 96, - ), - 'CJK Radicals Supplement' => array( - 0 => 0x2E80, - 1 => 0x2EFF, - 2 => 97, - ), - 'Kangxi Radicals' => array( - 0 => 0x2F00, - 1 => 0x2FDF, - 2 => 98, - ), - 'Ideographic Description Characters' => array( - 0 => 0x2FF0, - 1 => 0x2FFF, - 2 => 99, - ), - 'CJK Symbols and Punctuation' => array( - 0 => 0x3000, - 1 => 0x303F, - 2 => 100, - ), - 'Hiragana' => array( - 0 => 0x3040, - 1 => 0x309F, - 2 => 101, - ), - 'Katakana' => array( - 0 => 0x30A0, - 1 => 0x30FF, - 2 => 102, - ), - 'Bopomofo' => array( - 0 => 0x3100, - 1 => 0x312F, - 2 => 103, - ), - 'Hangul Compatibility Jamo' => array( - 0 => 0x3130, - 1 => 0x318F, - 2 => 104, - ), - 'Kanbun' => array( - 0 => 0x3190, - 1 => 0x319F, - 2 => 105, - ), - 'Bopomofo Extended' => array( - 0 => 0x31A0, - 1 => 0x31BF, - 2 => 106, - ), - 'CJK Strokes' => array( - 0 => 0x31C0, - 1 => 0x31EF, - 2 => 107, - ), - 'Katakana Phonetic Extensions' => array( - 0 => 0x31F0, - 1 => 0x31FF, - 2 => 108, - ), - 'Enclosed CJK Letters and Months' => array( - 0 => 0x3200, - 1 => 0x32FF, - 2 => 109, - ), - 'CJK Compatibility' => array( - 0 => 0x3300, - 1 => 0x33FF, - 2 => 110, - ), - 'CJK Unified Ideographs Extension A' => array( - 0 => 0x3400, - 1 => 0x4DBF, - 2 => 111, - ), - 'Yijing Hexagram Symbols' => array( - 0 => 0x4DC0, - 1 => 0x4DFF, - 2 => 112, - ), - 'CJK Unified Ideographs' => array( - 0 => 0x4E00, - 1 => 
0x9FFF, - 2 => 113, - ), - 'Yi Syllables' => array( - 0 => 0xA000, - 1 => 0xA48F, - 2 => 114, - ), - 'Yi Radicals' => array( - 0 => 0xA490, - 1 => 0xA4CF, - 2 => 115, - ), - 'Lisu' => array( - 0 => 0xA4D0, - 1 => 0xA4FF, - 2 => 116, - ), - 'Vai' => array( - 0 => 0xA500, - 1 => 0xA63F, - 2 => 117, - ), - 'Cyrillic Extended-B' => array( - 0 => 0xA640, - 1 => 0xA69F, - 2 => 118, - ), - 'Bamum' => array( - 0 => 0xA6A0, - 1 => 0xA6FF, - 2 => 119, - ), - 'Modifier Tone Letters' => array( - 0 => 0xA700, - 1 => 0xA71F, - 2 => 120, - ), - 'Latin Extended-D' => array( - 0 => 0xA720, - 1 => 0xA7FF, - 2 => 121, - ), - 'Syloti Nagri' => array( - 0 => 0xA800, - 1 => 0xA82F, - 2 => 122, - ), - 'Common Indic Number Forms' => array( - 0 => 0xA830, - 1 => 0xA83F, - 2 => 123, - ), - 'Phags-pa' => array( - 0 => 0xA840, - 1 => 0xA87F, - 2 => 124, - ), - 'Saurashtra' => array( - 0 => 0xA880, - 1 => 0xA8DF, - 2 => 125, - ), - 'Devanagari Extended' => array( - 0 => 0xA8E0, - 1 => 0xA8FF, - 2 => 126, - ), - 'Kayah Li' => array( - 0 => 0xA900, - 1 => 0xA92F, - 2 => 127, - ), - 'Rejang' => array( - 0 => 0xA930, - 1 => 0xA95F, - 2 => 128, - ), - 'Hangul Jamo Extended-A' => array( - 0 => 0xA960, - 1 => 0xA97F, - 2 => 129, - ), - 'Javanese' => array( - 0 => 0xA980, - 1 => 0xA9DF, - 2 => 130, - ), - 'Cham' => array( - 0 => 0xAA00, - 1 => 0xAA5F, - 2 => 131, - ), - 'Myanmar Extended-A' => array( - 0 => 0xAA60, - 1 => 0xAA7F, - 2 => 132, - ), - 'Tai Viet' => array( - 0 => 0xAA80, - 1 => 0xAADF, - 2 => 133, - ), - 'Ethiopic Extended-A' => array( - 0 => 0xAB00, - 1 => 0xAB2F, - 2 => 134, - ), - 'Meetei Mayek' => array( - 0 => 0xABC0, - 1 => 0xABFF, - 2 => 135, - ), - 'Hangul Syllables' => array( - 0 => 0xAC00, - 1 => 0xD7AF, - 2 => 136, - ), - 'Hangul Jamo Extended-B' => array( - 0 => 0xD7B0, - 1 => 0xD7FF, - 2 => 137, - ), - 'High Surrogates' => array( - 0 => 0xD800, - 1 => 0xDB7F, - 2 => 138, - ), - 'High Private Use Surrogates' => array( - 0 => 0xDB80, - 1 => 0xDBFF, - 2 => 139, - ), - 'Low 
Surrogates' => array( - 0 => 0xDC00, - 1 => 0xDFFF, - 2 => 140, - ), - 'Private Use Area' => array( - 0 => 0xE000, - 1 => 0xF8FF, - 2 => 141, - ), - 'CJK Compatibility Ideographs' => array( - 0 => 0xF900, - 1 => 0xFAFF, - 2 => 142, - ), - 'Alphabetic Presentation Forms' => array( - 0 => 0xFB00, - 1 => 0xFB4F, - 2 => 143, - ), - 'Arabic Presentation Forms-A' => array( - 0 => 0xFB50, - 1 => 0xFDFF, - 2 => 144, - ), - 'Variation Selectors' => array( - 0 => 0xFE00, - 1 => 0xFE0F, - 2 => 145, - ), - 'Vertical Forms' => array( - 0 => 0xFE10, - 1 => 0xFE1F, - 2 => 146, - ), - 'Combining Half Marks' => array( - 0 => 0xFE20, - 1 => 0xFE2F, - 2 => 147, - ), - 'CJK Compatibility Forms' => array( - 0 => 0xFE30, - 1 => 0xFE4F, - 2 => 148, - ), - 'Small Form Variants' => array( - 0 => 0xFE50, - 1 => 0xFE6F, - 2 => 149, - ), - 'Arabic Presentation Forms-B' => array( - 0 => 0xFE70, - 1 => 0xFEFF, - 2 => 150, - ), - 'Halfwidth and Fullwidth Forms' => array( - 0 => 0xFF00, - 1 => 0xFFEF, - 2 => 151, - ), - 'Specials' => array( - 0 => 0xFFF0, - 1 => 0xFFFF, - 2 => 152, - ), - 'Linear B Syllabary' => array( - 0 => 0x10000, - 1 => 0x1007F, - 2 => 153, - ), - 'Linear B Ideograms' => array( - 0 => 0x10080, - 1 => 0x100FF, - 2 => 154, - ), - 'Aegean Numbers' => array( - 0 => 0x10100, - 1 => 0x1013F, - 2 => 155, - ), - 'Ancient Greek Numbers' => array( - 0 => 0x10140, - 1 => 0x1018F, - 2 => 156, - ), - 'Ancient Symbols' => array( - 0 => 0x10190, - 1 => 0x101CF, - 2 => 157, - ), - 'Phaistos Disc' => array( - 0 => 0x101D0, - 1 => 0x101FF, - 2 => 158, - ), - 'Lycian' => array( - 0 => 0x10280, - 1 => 0x1029F, - 2 => 159, - ), - 'Carian' => array( - 0 => 0x102A0, - 1 => 0x102DF, - 2 => 160, - ), - 'Old Italic' => array( - 0 => 0x10300, - 1 => 0x1032F, - 2 => 161, - ), - 'Gothic' => array( - 0 => 0x10330, - 1 => 0x1034F, - 2 => 162, - ), - 'Ugaritic' => array( - 0 => 0x10380, - 1 => 0x1039F, - 2 => 163, - ), - 'Old Persian' => array( - 0 => 0x103A0, - 1 => 0x103DF, - 2 => 164, - ), - 'Deseret' 
=> array( - 0 => 0x10400, - 1 => 0x1044F, - 2 => 165, - ), - 'Shavian' => array( - 0 => 0x10450, - 1 => 0x1047F, - 2 => 166, - ), - 'Osmanya' => array( - 0 => 0x10480, - 1 => 0x104AF, - 2 => 167, - ), - 'Cypriot Syllabary' => array( - 0 => 0x10800, - 1 => 0x1083F, - 2 => 168, - ), - 'Imperial Aramaic' => array( - 0 => 0x10840, - 1 => 0x1085F, - 2 => 169, - ), - 'Phoenician' => array( - 0 => 0x10900, - 1 => 0x1091F, - 2 => 170, - ), - 'Lydian' => array( - 0 => 0x10920, - 1 => 0x1093F, - 2 => 171, - ), - 'Kharoshthi' => array( - 0 => 0x10A00, - 1 => 0x10A5F, - 2 => 172, - ), - 'Old South Arabian' => array( - 0 => 0x10A60, - 1 => 0x10A7F, - 2 => 173, - ), - 'Avestan' => array( - 0 => 0x10B00, - 1 => 0x10B3F, - 2 => 174, - ), - 'Inscriptional Parthian' => array( - 0 => 0x10B40, - 1 => 0x10B5F, - 2 => 175, - ), - 'Inscriptional Pahlavi' => array( - 0 => 0x10B60, - 1 => 0x10B7F, - 2 => 176, - ), - 'Old Turkic' => array( - 0 => 0x10C00, - 1 => 0x10C4F, - 2 => 177, - ), - 'Rumi Numeral Symbols' => array( - 0 => 0x10E60, - 1 => 0x10E7F, - 2 => 178, - ), - 'Brahmi' => array( - 0 => 0x11000, - 1 => 0x1107F, - 2 => 179, - ), - 'Kaithi' => array( - 0 => 0x11080, - 1 => 0x110CF, - 2 => 180, - ), - 'Cuneiform' => array( - 0 => 0x12000, - 1 => 0x123FF, - 2 => 181, - ), - 'Cuneiform Numbers and Punctuation' => array( - 0 => 0x12400, - 1 => 0x1247F, - 2 => 182, - ), - 'Egyptian Hieroglyphs' => array( - 0 => 0x13000, - 1 => 0x1342F, - 2 => 183, - ), - 'Bamum Supplement' => array( - 0 => 0x16800, - 1 => 0x16A3F, - 2 => 184, - ), - 'Kana Supplement' => array( - 0 => 0x1B000, - 1 => 0x1B0FF, - 2 => 185, - ), - 'Byzantine Musical Symbols' => array( - 0 => 0x1D000, - 1 => 0x1D0FF, - 2 => 186, - ), - 'Musical Symbols' => array( - 0 => 0x1D100, - 1 => 0x1D1FF, - 2 => 187, - ), - 'Ancient Greek Musical Notation' => array( - 0 => 0x1D200, - 1 => 0x1D24F, - 2 => 188, - ), - 'Tai Xuan Jing Symbols' => array( - 0 => 0x1D300, - 1 => 0x1D35F, - 2 => 189, - ), - 'Counting Rod Numerals' => array( - 
0 => 0x1D360, - 1 => 0x1D37F, - 2 => 190, - ), - 'Mathematical Alphanumeric Symbols' => array( - 0 => 0x1D400, - 1 => 0x1D7FF, - 2 => 191, - ), - 'Mahjong Tiles' => array( - 0 => 0x1F000, - 1 => 0x1F02F, - 2 => 192, - ), - 'Domino Tiles' => array( - 0 => 0x1F030, - 1 => 0x1F09F, - 2 => 193, - ), - 'Playing Cards' => array( - 0 => 0x1F0A0, - 1 => 0x1F0FF, - 2 => 194, - ), - 'Enclosed Alphanumeric Supplement' => array( - 0 => 0x1F100, - 1 => 0x1F1FF, - 2 => 195, - ), - 'Enclosed Ideographic Supplement' => array( - 0 => 0x1F200, - 1 => 0x1F2FF, - 2 => 196, - ), - 'Miscellaneous Symbols And Pictographs' => array( - 0 => 0x1F300, - 1 => 0x1F5FF, - 2 => 197, - ), - 'Emoticons' => array( - 0 => 0x1F600, - 1 => 0x1F64F, - 2 => 198, - ), - 'Transport And Map Symbols' => array( - 0 => 0x1F680, - 1 => 0x1F6FF, - 2 => 199, - ), - 'Alchemical Symbols' => array( - 0 => 0x1F700, - 1 => 0x1F77F, - 2 => 200, - ), - 'CJK Unified Ideographs Extension B' => array( - 0 => 0x20000, - 1 => 0x2A6DF, - 2 => 201, - ), - 'CJK Unified Ideographs Extension C' => array( - 0 => 0x2A700, - 1 => 0x2B73F, - 2 => 202, - ), - 'CJK Unified Ideographs Extension D' => array( - 0 => 0x2B740, - 1 => 0x2B81F, - 2 => 203, - ), - 'CJK Compatibility Ideographs Supplement' => array( - 0 => 0x2F800, - 1 => 0x2FA1F, - 2 => 204, - ), - 'Tags' => array( - 0 => 0xE0000, - 1 => 0xE007F, - 2 => 205, - ), - 'Variation Selectors Supplement' => array( - 0 => 0xE0100, - 1 => 0xE01EF, - 2 => 206, - ), - 'Supplementary Private Use Area-A' => array( - 0 => 0xF0000, - 1 => 0xFFFFF, - 2 => 207, - ), - 'Supplementary Private Use Area-B' => array( - 0 => 0x100000, - 1 => 0x10FFFF, - 2 => 208, - ), - ); - - #calling the methods of this class only statically! 
- private function __construct() - { - } - - /** - * Remove combining diactrical marks, with possibility of the restore - * Удаляет диакритические знаки в тексте, с возможностью восстановления (опция) - * - * @param string|null $s - * @param array|null $additional_chars for example: "\xc2\xad" #soft hyphen = discretionary hyphen - * @param bool $is_can_restored - * @param array|null &$restore_table - * @return string|bool|null Returns FALSE if error occurred - */ - public static function diactrical_remove($s, $additional_chars = null, $is_can_restored = false, &$restore_table = null) - { - if (!ReflectionTypeHint::isValid()) { - return false; - } - if (null === $s) { - return $s; - } - - if ($additional_chars) { - foreach ($additional_chars as $k => &$v) { - $v = preg_quote($v, '/'); - } - $re = '/((?>' . self::$diactrical_re . '|' . implode('|', $additional_chars) . ')+)/sxSX'; - } else { - $re = '/((?>' . self::$diactrical_re . ')+)/sxSX'; - } - if (!$is_can_restored) { - return preg_replace($re, '', $s); - } - - $restore_table = array(); - $a = preg_split($re, $s, -1, PREG_SPLIT_DELIM_CAPTURE); - $c = count($a); - if ($c === 1) { - return $s; - } - $pos = 0; - $s2 = ''; - for ($i = 0; $i < $c - 1; $i += 2) { - $s2 .= $a[$i]; - #запоминаем символьные (не байтовые!) позиции - $pos += self::strlen($a[$i]); - $restore_table['offsets'][$pos] = $a[$i + 1]; - } - $restore_table['length'] = $pos + self::strlen(end($a)); - return $s2 . end($a); - } - - /** - * Restore combining diactrical marks, removed by self::diactrical_remove() - * In Russian: - * Восстанавливает диакритические знаки в тексте, при условии, что их символьные позиции и кол-во символов не изменились! 
- * - * @see self::diactrical_remove() - * @param string|null $s - * @param array $restore_table - * @return string|bool|null Returns FALSE if error occurred (broken $restore_table) - */ - public static function diactrical_restore($s, array $restore_table) - { - if (!ReflectionTypeHint::isValid()) { - return false; - } - if (null === $s) { - return $s; - } - - if (!$restore_table) { - return $s; - } - if (!is_int(@$restore_table['length']) || - !is_array(@$restore_table['offsets']) || - $restore_table['length'] !== self::strlen($s) - ) { - return false; - } - $a = array(); - $length = $offset = 0; - $s2 = ''; - foreach ($restore_table['offsets'] as $pos => $diactricals) { - $length = $pos - $offset; - $s2 .= self::substr($s, $offset, $length) . $diactricals; - $offset = $pos; - } - return $s2 . self::substr($s, $offset, strlen($s)); - } - - /** - * Encodes data from another character encoding to UTF-8. - * - * @param array|scalar|null $data - * @param string $charset - * @return array|scalar|null Returns FALSE if error occurred - */ - public static function convert_from($data, $charset = 'cp1251') - { - if (!ReflectionTypeHint::isValid()) { - return false; - } - return self::_convert($data, $charset, 'UTF-8'); - } - - /** - * Encodes data from UTF-8 to another character encoding. - * - * @param array|scalar|null $data - * @param string $charset - * @return array|scalar|null Returns FALSE if error occurred - */ - public static function convert_to($data, $charset = 'cp1251') - { - if (!ReflectionTypeHint::isValid()) { - return false; - } - return self::_convert($data, 'UTF-8', $charset); - } - - /** - * Recoding the data of any structure to/from UTF-8. - * Arrays traversed recursively, recoded keys and values. 
- * - * @see mb_encoding_aliases() - * @param array|scalar|null $data - * @param string $charset_from - * @param string $charset_to - * @return array|scalar|null Returns FALSE if error occurred - */ - private static function _convert($data, $charset_from, $charset_to) - { - if (!ReflectionTypeHint::isValid()) { - return false; - } #for recursive calls - if ($charset_from === $charset_to) { - return $data; - } - if (is_array($data)) { - $d = array(); - foreach ($data as $k => &$v) { - $k = self::_convert($k, $charset_from, $charset_to); - if ($k === false) { - return false; - } - $d[$k] = self::_convert($v, $charset_from, $charset_to); - if ($d[$k] === false && !is_bool($v)) { - return false; - } - } - return $d; - } - if (is_string($data)) { - #smart behaviour for errors protected + speed improve - if ($charset_from === 'UTF-8' && !self::is_utf8($data)) { - return $data; - } - if ($charset_to === 'UTF-8' && self::is_utf8($data)) { - return $data; - } - - #since PHP-5.3.x iconv() faster then mb_convert_encoding() - if (function_exists('iconv')) { - return iconv($charset_from, $charset_to . 
'//IGNORE//TRANSLIT', $data); - } - if (function_exists('mb_convert_encoding')) { - return mb_convert_encoding($data, $charset_to, $charset_from); - } - - #charset_from - if ($charset_from === 'UTF-16' || $charset_from === 'UCS-2') { - return self::_convert_from_utf16($data); - } - if ($charset_from === 'cp1251' || $charset_from === 'cp1259') { - return strtr($data, self::$cp1259_table); - } - if ($charset_from === 'koi8-r' || $charset_from === 'KOI8-R') { - return strtr(convert_cyr_string($data, 'k', 'w'), self::$cp1259_table); - } - if ($charset_from === 'iso8859-5') { - return strtr(convert_cyr_string($data, 'i', 'w'), self::$cp1259_table); - } - if ($charset_from === 'cp866') { - return strtr(convert_cyr_string($data, 'a', 'w'), self::$cp1259_table); - } - if ($charset_from === 'mac-cyrillic') { - return strtr(convert_cyr_string($data, 'm', 'w'), self::$cp1259_table); - } - - #charset_to - if ($charset_to === 'cp1251' || $charset_to === 'cp1259') { - return strtr($data, array_flip(self::$cp1259_table)); - } - - #last trying - if (function_exists('recode_string')) { - $s = @recode_string($charset_from . '..' . $charset_to, $data); - if (is_string($s)) { - return $s; - } - } - - trigger_error('Convert "' . $charset_from . '" --> "' . $charset_to . '" is not supported native, "iconv" or "mbstring" extension required', E_USER_WARNING); - return false; - } - return $data; - } - - /** - * Convert UTF-16 / UCS-2 encoding string to UTF-8. - * Surrogates UTF-16 are supported! - * - * In Russian: - * Преобразует строку из кодировки UTF-16 / UCS-2 в UTF-8. - * Суррогаты UTF-16 поддерживаются! - * - * @param string $s - * @param string $type 'BE' -- big endian byte order - * 'LE' -- little endian byte order - * @param bool $to_array returns array chars instead whole string? 
- * @return string|array|bool UTF-8 string, array chars or FALSE if error occurred - */ - private static function _convert_from_utf16($s, $type = 'BE', $to_array = false) - { - static $types = array( - 'BE' => 'n', #unsigned short (always 16 bit, big endian byte order) - 'LE' => 'v', #unsigned short (always 16 bit, little endian byte order) - ); - if (!array_key_exists($type, $types)) { - trigger_error('Unexpected value in 2-nd parameter, "' . $type . '" given!', E_USER_WARNING); - return false; - } - #the fastest way: - if (function_exists('iconv') || function_exists('mb_convert_encoding')) { - if (function_exists('iconv')) { - $s = iconv('UTF-16' . $type, 'UTF-8', $s); - } elseif (function_exists('mb_convert_encoding')) { - $s = mb_convert_encoding($s, 'UTF-8', 'UTF-16' . $type); - } - if (!$to_array) { - return $s; - } - return self::str_split($s); - } - - /* - http://en.wikipedia.org/wiki/UTF-16 - - The improvement that UTF-16 made over UCS-2 is its ability to encode - characters in planes 1-16, not just those in plane 0 (BMP). - - UTF-16 represents non-BMP characters (those from U+10000 through U+10FFFF) - using a pair of 16-bit words, known as a surrogate pair. - First 1000016 is subtracted from the code point to give a 20-bit value. - This is then split into two separate 10-bit values each of which is represented - as a surrogate with the most significant half placed in the first surrogate. - To allow safe use of simple word-oriented string processing, separate ranges - of values are used for the two surrogates: 0xD800-0xDBFF for the first, most - significant surrogate and 0xDC00-0xDFFF for the second, least significant surrogate. - - For example, the character at code point U+10000 becomes the code unit sequence 0xD800 0xDC00, - and the character at U+10FFFD, the upper limit of Unicode, becomes the sequence 0xDBFF 0xDFFD. 
- Unicode and ISO/IEC 10646 do not, and will never, assign characters to any of the code points - in the U+D800-U+DFFF range, so an individual code value from a surrogate pair does not ever - represent a character. - - http://www.russellcottrell.com/greek/utilities/SurrogatePairCalculator.htm - http://www.russellcottrell.com/greek/utilities/UnicodeRanges.htm - - Conversion of a Unicode scalar value S to a surrogate pair : - H = Math.floor((S - 0x10000) / 0x400) + 0xD800; - L = ((S - 0x10000) % 0x400) + 0xDC00; - The conversion of a surrogate pair to a scalar value: - N = ((H - 0xD800) * 0x400) + (L - 0xDC00) + 0x10000; - */ - $a = array(); - $hi = false; - foreach (unpack($types[$type] . '*', $s) as $codepoint) { - #surrogate process - if ($hi !== false) { - $lo = $codepoint; - if ($lo < 0xDC00 || $lo > 0xDFFF) { - $a[] = "\xEF\xBF\xBD"; - } #U+FFFD REPLACEMENT CHARACTER (for broken char) - else { - $codepoint = (($hi - 0xD800) * 0x400) + ($lo - 0xDC00) + 0x10000; - $a[] = self::chr($codepoint); - } - $hi = false; - } elseif ($codepoint < 0xD800 || $codepoint > 0xDBFF) { - $a[] = self::chr($codepoint); - } #not surrogate - else { - $hi = $codepoint; - } #surrogate was found - } - return $to_array ? $a : implode('', $a); - } - - /** - * Strips out device control codes in the ASCII range. - * - * @param string|null String to clean - * @return string|bool|null Returns FALSE if error occurred - */ - public static function strict($s) - { - if (!ReflectionTypeHint::isValid()) { - return false; - } - if (null === $s) { - return $s; - } - return preg_replace('/[\x00-\x08\x0B\x0C\x0E-\x1F]+/sSX', '', $s); - } - - /** - * Check the data accessory to the class of characters ASCII. - * For null, integer, float, boolean returns TRUE. - * - * Массивы обходятся рекурсивно, если в хотябы одном элементе массива - * его значение не ASCII, возвращается FALSE. 
- * - * @param array|scalar|null $data - * @return bool - */ - public static function is_ascii($data) - { - if (!ReflectionTypeHint::isValid()) { - return false; - } - if (is_array($data)) { - foreach ($data as $k => &$v) { - if (!self::is_ascii($k) || !self::is_ascii($v)) { - return false; - } - } - return true; - } - #ltrim() little faster then preg_match() - #if (is_string($data)) return preg_match('/^[\x00-\x7f]*$/sSX', $data); #deprecated - if (is_string($data)) { - return ltrim($data, "\x00..\x7f") === ''; - } - if (is_scalar($data) || null === $data) { - return true; - } #~ null, integer, float, boolean - return false; #object or resource - } - - /** - * Returns true if data is valid UTF-8 and false otherwise. - * For null, integer, float, boolean returns TRUE. - * - * The arrays are traversed recursively, if At least one element of the array - * its value is not in UTF-8, returns FALSE. - * - * @link http://www.w3.org/International/questions/qa-forms-utf-8.html - * @link http://ru3.php.net/mb_detect_encoding - * @link http://webtest.philigon.ru/articles/utf8/ - * @link http://unicode.coeurlumiere.com/ - * @param array|scalar|null $data - * @param bool $is_strict strict the range of ASCII? - * @return bool - */ - public static function is_utf8($data, $is_strict = true) - { - if (!ReflectionTypeHint::isValid()) { - return false; - } - if (is_array($data)) { - foreach ($data as $k => &$v) { - if (!self::is_utf8($k, $is_strict) || !self::is_utf8($v, $is_strict)) { - return false; - } - } - return true; - } - if (is_string($data)) { - if (!preg_match('~~suSX', $data)) { - return false; - } - if (function_exists('preg_last_error') && preg_last_error() !== PREG_NO_ERROR) { - return false; - } - #preg_match('~~suSX') much faster (up to 4 times), then mb_check_encoding($data, 'UTF-8')! - #if (function_exists('mb_check_encoding') && ! 
mb_check_encoding($data, 'UTF-8')) return false; #DEPRECATED - if ($is_strict && preg_match('/[^\x09\x0A\x0D\x20-\xBF\xC2-\xF7]/sSX', $data)) { - return false; - } - return true; - } - if (is_scalar($data) || null === $data) { - return true; - } #~ null, integer, float, boolean - return false; #object or resource - } - - /** - * Tries to detect if a string is in Unicode encoding - * - * @deprecated Slowly, use self::is_utf8() instead - * @see self::is_utf8() - * @param string $s текст - * @param bool $is_strict строгая проверка диапазона ASCII? - * @return bool - */ - public static function check($s, $is_strict = true) - { - if (!ReflectionTypeHint::isValid()) { - return false; - } - for ($i = 0, $len = strlen($s); $i < $len; $i++) { - $c = ord($s[$i]); - if ($c < 0x80) { - #1 byte 0bbbbbbb - - if ($is_strict === false || ($c > 0x1F && $c < 0x7F) || $c == 0x09 || $c == 0x0A || $c == 0x0D) { - continue; - } - } - if (($c & 0xE0) == 0xC0) { - $n = 1; - } #2 bytes 110bbbbb 10bbbbbb - elseif (($c & 0xF0) == 0xE0) { - $n = 2; - } #3 bytes 1110bbbb 10bbbbbb 10bbbbbb - elseif (($c & 0xF8) == 0xF0) { - $n = 3; - } #4 bytes 11110bbb 10bbbbbb 10bbbbbb 10bbbbbb - elseif (($c & 0xFC) == 0xF8) { - $n = 4; - } #5 bytes 111110bb 10bbbbbb 10bbbbbb 10bbbbbb 10bbbbbb - elseif (($c & 0xFE) == 0xFC) { - $n = 5; - } #6 bytes 1111110b 10bbbbbb 10bbbbbb 10bbbbbb 10bbbbbb 10bbbbbb - else { - return false; - } #does not match any model - #n bytes matching 10bbbbbb follow ? - for ($j = 0; $j < $n; $j++) { - $i++; - if ($i == $len || ((ord($s[$i]) & 0xC0) != 0x80)) { - return false; - } - } - } - return true; - } - - /** - * Check the data in UTF-8 charset on given ranges of the standard UNICODE. - * The suitable alternative to regular expressions. - * - * For null, integer, float, boolean returns TRUE. - * - * Arrays traversed recursively (keys and values). - * At least if one array element value is not passed checking, it returns FALSE. 
- * - * @example - * #A simple check the standard named ranges: - * UTF8::blocks_check('поисковые системы Google и Yandex', array('Basic Latin', 'Cyrillic')); - * #You can check the named, direct ranges or codepoints together: - * UTF8::blocks_check('поисковые системы Google и Yandex', array(array(0x20, 0x7E), #[\x20-\x7E] - * array(0x0410, 0x044F), #[A-Яa-я] - * 0x0401, #russian yo (Ё) - * 0x0451, #russian ye (ё) - * 'Arrows', - * )); - * - * @link http://www.unicode.org/charts/ - * @param array|scalar|null $data - * @param array|string $blocks - * @return bool Возвращает TRUE, если все символы из текста принадлежат указанным диапазонам - * и FALSE в противном случае или для разбитого UTF-8. - */ - public static function blocks_check($data, $blocks) - { - if (!ReflectionTypeHint::isValid()) { - return false; - } - - if (is_array($data)) { - foreach ($data as $k => &$v) { - if (!self::blocks_check($k, $blocks) || !self::blocks_check($v, $blocks)) { - return false; - } - } - return true; - } - - if (is_string($data)) { - $chars = self::str_split($data); - if ($chars === false) { - return false; - } #broken UTF-8 - unset($data); #memory free - $skip = array(); #save to cache already checked symbols - foreach ($chars as $i => $char) { - if (array_key_exists($char, $skip)) { - continue; - } #speed improve - $codepoint = self::ord($char); - if ($codepoint === false) { - return false; - } #broken UTF-8 - $is_valid = false; - $blocks = (array)$blocks; - foreach ($blocks as $j => $block) { - if (is_string($block)) { - if (!array_key_exists($block, self::$unicode_blocks)) { - trigger_error('Unknown block "' . $block . '"!', E_USER_WARNING); - return false; - } - list($min, $max) = self::$unicode_blocks[$block]; - } elseif (is_array($block)) { - list($min, $max) = $block; - } elseif (is_int($block)) { - $min = $max = $block; - } else { - trigger_error('A string/array/int type expected for block[' . $j . 
']!', E_USER_ERROR); - } - if ($codepoint >= $min && $codepoint <= $max) { - $is_valid = true; - break; - } - }#foreach - if (!$is_valid) { - return false; - } - $skip[$char] = null; - }#foreach - return true; - } - if (is_scalar($data) || null === $data) { - return true; - } #~ null, integer, float, boolean - return false; #object or resource - } - - /** - * Recode $_GET, $_POST, $_COOKIE, $_REQUEST, $_FILES from $charset encoding to UTF-8, if necessary. - * A side effect is a positive protection against XSS attacks with non-printable characters on the vulnerable PHP function. - * Thus web forms can be sent to the server in 2-encoding: $charset and UTF-8. - * For example: ?тест[тест]=тест - * - * Алгоритм работы: - * 1) Функция проверяет массивы $_GET, $_POST, $_COOKIE, $_REQUEST, $_FILES - * на корректность значений элементов кодировке UTF-8. - * 2) Значения не в UTF-8 принимаются как $charset и конвертируется в UTF-8, - * при этом байты от 0x00 до 0x7F (ASCII) сохраняются как есть. - * 3) Сконвертированные значения снова проверяются. - * Если данные опять не в кодировке UTF-8, то они считаются разбитыми и функция возвращает FALSE. - * - * NOTICE - * Функция должна вызываться после self::unescape_request()! - * - * @see self::unescape_request() - * @param bool $is_hex2bin Декодировать HEX-данные? - * Пример: 0xd09ec2a0d0bad0bed0bcd0bfd0b0d0bdd0b8d0b8 => О компании - * Параметры в URL адресах иногда бывает удобно кодировать не функцией rawurlencode(), - * а использовать следующий механизм (к тому же кодирующий данные более компактно): - * '0x' . bin2hex($string) - * @param string $charset - * @return bool Возвращает TRUE, если все значения элементов массивов в кодировке UTF-8 - * и FALSE + E_USER_WARNING в противном случае. 
- */ - public static function autoconvert_request($is_hex2bin = false, $charset = 'cp1251') - { - if (!ReflectionTypeHint::isValid()) { - return false; - } - $is_converted = false; - $is_broken = false; - foreach (array('_GET', '_POST', '_COOKIE', '_FILES') as $k => $v) { - if (!array_key_exists($v, $GLOBALS)) { - continue; - } - #использовать array_walk_recursive() не предоставляется возможным, - #т.к. его callback функция не поддерживает передачу ключа по ссылке - $GLOBALS[$v] = self::_autoconvert_request_recursive($GLOBALS[$v], $is_converted, $is_broken, $is_hex2bin, $charset); - if ($is_broken) { - trigger_error('Array $' . $v . ' does not have keys/values in UTF-8 charset!', E_USER_WARNING); - return false; - } - } - if ($is_converted) { - $_REQUEST = - (isset($_COOKIE) ? $_COOKIE : array()) + - (isset($_POST) ? $_POST : array()) + - (isset($_GET) ? $_GET : array()); - } - return true; - } - - private static function _autoconvert_request_recursive(&$data, &$is_converted, &$is_broken, $is_hex2bin, $charset) - { - if ($is_broken) { - return $data; - } #speed improve - if (is_array($data)) { - $d = array(); - foreach ($data as $k => &$v) { - $k = self::_autoconvert_request($k, $is_converted, $is_broken, $is_hex2bin, $charset); - if ($is_broken) { - return $data; - } #speed improve - $d[$k] = self::_autoconvert_request_recursive($v, $is_converted, $is_broken, $is_hex2bin, $charset); - if ($is_broken) { - return $data; - } #speed improve - } - return $d; - } - return self::_autoconvert_request($data, $is_converted, $is_broken, $is_hex2bin, $charset); - } - - private static function _autoconvert_request(&$s, &$is_converted, &$is_broken, $is_hex2bin, $charset) - { - #regexp speed improve by using strpos() - if ($is_hex2bin && strpos($s, '0x') === 0 && preg_match('/^0x((?:[\da-fA-F]{2})+)$/sSX', $s, $m)) { - $s = pack('H' . 
strlen($m[1]), $m[1]); #hex2bin() - $is_converted = true; - } - if (!self::is_utf8($s)) { - $s = self::convert_from($s, $charset); - if ($s === false) { - $is_broken = true; - } elseif (!self::is_utf8($s)) { - trigger_error('String 0x ' . substr(bin2hex($s), 0, 100) . '... is not UTF-8!', E_USER_WARNING); - $is_broken = true; - } else { - $is_converted = true; - } - } - return $s; - } - - /** - * Сравнение строк - * - * @param string|null $s1 - * @param string|null $s2 - * @param string $locale For example, 'en_CA', 'ru_RU' - * @return int|bool|null Returns FALSE if error occurred - * Returns < 0 if $s1 is less than $s2; - * > 0 if $s1 is greater than $s2; - * 0 if they are equal. - */ - public static function strcmp($s1, $s2, $locale = '') - { - if (!ReflectionTypeHint::isValid()) { - return false; - } - if (null === $s1 || null === $s2) { - return null; - } - if (!function_exists('collator_create')) { - return strcmp($s1, $s2); - } - # PHP 5 >= 5.3.0, PECL intl >= 1.0.0 - # If empty string ("") or "root" are passed, UCA rules will be used. - $c = new Collator($locale); - if (!$c) { - # Returns an "empty" object on error. You can use intl_get_error_code() and/or intl_get_error_message() to know what happened. - trigger_error(intl_get_error_message(), E_USER_WARNING); - return false; - } - return $c->compare($s1, $s2); - } - - /** - * Сравнение строк для N первых символов - * - * @param string|null $s1 - * @param string|null $s2 - * @param int $length - * @return int|bool|null Returns FALSE if error occurred - * Returns < 0 if $s1 is less than $s2; - * > 0 if $s1 is greater than $s2; - * 0 if they are equal. - */ - public static function strncmp($s1, $s2, $length) - { - if (!ReflectionTypeHint::isValid()) { - return false; - } - if (null === $s1 || null === $s2) { - return null; - } - return self::strcmp(self::substr($s1, 0, $length), self::substr($s2, 0, $length)); - } - - /** - * Implementation strcasecmp() function for UTF-8 encoding string. 
- * - * @param string|null $s1 - * @param string|null $s2 - * @return int|bool|null Returns FALSE if error occurred - * Returns < 0 if $s1 is less than $s2; - * > 0 if $s1 is greater than $s2; - * 0 if they are equal. - */ - public static function strcasecmp($s1, $s2) - { - if (!ReflectionTypeHint::isValid()) { - return false; - } - if (null === $s1 || null === $s2) { - return null; - } - return self::strcmp(self::lowercase($s1), self::lowercase($s2)); - } - - /** - * Converts a UTF-8 string to a UNICODE codepoints - * - * @param string|null $s UTF-8 string - * @return array|bool|null Unicode codepoints - * Returns FALSE if $s broken (not UTF-8) - */ - public static function to_unicode($s) - { - if (!ReflectionTypeHint::isValid()) { - return false; - } - if (null === $s) { - return $s; - } - - $s2 = null; - #since PHP-5.3.x iconv() little faster then mb_convert_encoding() - if (function_exists('iconv')) { - $s2 = @iconv('UTF-8', 'UCS-4BE', $s); - } elseif (function_exists('mb_convert_encoding')) { - $s2 = @mb_convert_encoding($s, 'UCS-4BE', 'UTF-8'); - } - if (is_string($s2)) { - return array_values(unpack('N*', $s2)); - } - if ($s2 !== null) { - return false; - } - - $a = self::str_split($s); - if ($a === false) { - return false; - } - return array_map(array(__CLASS__, 'ord'), $a); - } - - /** - * Converts a UNICODE codepoints to a UTF-8 string - * - * @param array|null $a Unicode codepoints - * @return string|bool|null UTF-8 string - * Returns FALSE if error occurred - */ - public static function from_unicode($a) - { - if (!ReflectionTypeHint::isValid()) { - return false; - } - if (null === $a) { - return $a; - } - - #since PHP-5.3.x iconv() little faster then mb_convert_encoding() - if (function_exists('iconv')) { - array_walk($a, function (&$cp) { - $cp = pack('N', $cp); - }); - $s = @iconv('UCS-4BE', 'UTF-8', implode('', $a)); - if (!is_string($s)) { - return false; - } - return $s; - } - if (function_exists('mb_convert_encoding')) { - array_walk($a, function 
(&$cp) { - $cp = pack('N', $cp); - }); - $s = mb_convert_encoding(implode('', $a), 'UTF-8', 'UCS-4BE'); - if (!is_string($s)) { - return false; - } - return $s; - } - - return implode('', array_map(array(__CLASS__, 'chr'), $a)); - } - - /** - * Converts a UTF-8 character to a UNICODE codepoint - * - * @param string|null $char UTF-8 character - * @return int|bool|null Unicode codepoint - * Returns FALSE if $char broken (not UTF-8) - */ - public static function ord($char) - { - if (!ReflectionTypeHint::isValid()) { - return false; - } - if (null === $char) { - return $char; - } - - static $cache = array(); - if (array_key_exists($char, $cache)) { - return $cache[$char]; - } #speed improve - - switch (strlen($char)) { - case 1: - return $cache[$char] = ord($char); - case 2: - return $cache[$char] = (ord($char{1}) & 63) | - ((ord($char{0}) & 31) << 6); - case 3: - return $cache[$char] = (ord($char{2}) & 63) | - ((ord($char{1}) & 63) << 6) | - ((ord($char{0}) & 15) << 12); - case 4: - return $cache[$char] = (ord($char{3}) & 63) | - ((ord($char{2}) & 63) << 6) | - ((ord($char{1}) & 63) << 12) | - ((ord($char{0}) & 7) << 18); - default: - trigger_error('Character 0x' . bin2hex($char) . ' is not UTF-8!', E_USER_WARNING); - return false; - } - } - - /** - * Converts a UNICODE codepoint to a UTF-8 character - * - * @param int|digit|null $cp Unicode codepoint - * @return string|bool|null UTF-8 character - * Returns FALSE if error occurred - */ - public static function chr($cp) - { - if (!ReflectionTypeHint::isValid()) { - return false; - } - if (null === $cp) { - return $cp; - } - - static $cache = array(); - if (array_key_exists($cp, $cache)) { - return $cache[$cp]; - } #speed improve - - if ($cp <= 0x7f) { - return $cache[$cp] = chr($cp); - } - if ($cp <= 0x7ff) { - return $cache[$cp] = chr(0xc0 | ($cp >> 6)) . - chr(0x80 | ($cp & 0x3f)); - } - if ($cp <= 0xffff) { - return $cache[$cp] = chr(0xe0 | ($cp >> 12)) . - chr(0x80 | (($cp >> 6) & 0x3f)) . 
- chr(0x80 | ($cp & 0x3f)); - } - if ($cp <= 0x10ffff) { - return $cache[$cp] = chr(0xf0 | ($cp >> 18)) . - chr(0x80 | (($cp >> 12) & 0x3f)) . - chr(0x80 | (($cp >> 6) & 0x3f)) . - chr(0x80 | ($cp & 0x3f)); - } - #U+FFFD REPLACEMENT CHARACTER - return $cache[$cp] = "\xEF\xBF\xBD"; - } - - /** - * Implementation chunk_split() function for UTF-8 encoding string. - * - * @param string|null $s - * @param int|digit|null $length - * @param string|null $glue - * @return string|bool|null Returns FALSE if error occurred - */ - public static function chunk_split($s, $length = null, $glue = null) - { - if (!ReflectionTypeHint::isValid()) { - return false; - } - if (null === $s) { - return $s; - } - - $length = (int)$length; - $glue = (string)$glue; - if ($length < 1) { - $length = 76; - } - if ($glue === '') { - $glue = "\r\n"; - } - if (!is_array($a = self::str_split($s, $length))) { - return false; - } - return implode($glue, $a); - } - - /** - * Changes all keys in an array - * - * @param array|null $a - * @param int $mode {CASE_LOWER|CASE_UPPER} - * @return array|bool|null Returns FALSE if error occurred - */ - public static function array_change_key_case($a, $mode) - { - if (!ReflectionTypeHint::isValid()) { - return false; - } - if (!is_array($a)) { - return $a; - } - $a2 = array(); - foreach ($a as $k => $v) { - if (is_string($k)) { - $k = self::convert_case($k, $mode); - if ($k === false) { - return false; - } - } - $a2[$k] = $v; - } - return $a2; - } - - /** - * Конвертирует регистр букв в данных в кодировке UTF-8. - * Массивы обходятся рекурсивно, при этом конвертируются только значения - * в элементах массива, а ключи остаются без изменений. - * Для конвертирования только ключей используйте метод self::array_change_key_case(). 
- * - * @see self::array_change_key_case() - * @link http://www.unicode.org/charts/PDF/U0400.pdf - * @link http://ru.wikipedia.org/wiki/ISO_639-1 - * @param array|scalar|null $data Данные произвольной структуры - * @param int $mode {CASE_LOWER|CASE_UPPER} - * @param bool $is_ascii_optimization for speed improve - * @return scalar|bool|null Returns FALSE if error occurred - */ - public static function convert_case($data, $mode, $is_ascii_optimization = true) - { - if (!ReflectionTypeHint::isValid()) { - return false; - } - - if (is_array($data)) { - foreach ($data as $k => &$v) { - $v = self::convert_case($v, $mode); - } - return $data; - } - if (!is_string($data) || !$data) { - return $data; - } - - if ($mode === CASE_UPPER) { - if ($is_ascii_optimization && self::is_ascii($data)) { - return strtoupper($data); - } #speed improve! - #deprecated, since PHP-5.3.x strtr() 2-3 times faster then mb_strtolower() - #if (function_exists('mb_strtoupper')) return mb_strtoupper($data, 'utf-8'); - return strtr($data, array_flip(self::$convert_case_table)); - } - if ($mode === CASE_LOWER) { - if ($is_ascii_optimization && self::is_ascii($data)) { - return strtolower($data); - } #speed improve! 
- #deprecated, since PHP-5.3.x strtr() 2-3 times faster then mb_strtolower() - #if (function_exists('mb_strtolower')) return mb_strtolower($data, 'utf-8'); - return strtr($data, self::$convert_case_table); - } - trigger_error('Parameter 2 should be a constant of CASE_LOWER or CASE_UPPER!', E_USER_WARNING); - return $data; - } - - /** - * Convert a data to lower case - * - * @param array|scalar|null $data - * @return scalar|bool|null Returns FALSE if error occurred - */ - public static function lowercase($data) - { - if (!ReflectionTypeHint::isValid()) { - return false; - } - return self::convert_case($data, CASE_LOWER); - } - - /** - * Convert a data to upper case - * - * @param array|scalar|null $data - * @return scalar|null Returns FALSE if error occurred - */ - public static function uppercase($data) - { - if (!ReflectionTypeHint::isValid()) { - return false; - } - return self::convert_case($data, CASE_UPPER); - } - - /** - * Convert a data to lower case - * - * @param array|scalar|null $data - * @return scalar|bool|null Returns FALSE if error occurred - */ - public static function strtolower($data) - { - if (!ReflectionTypeHint::isValid()) { - return false; - } - return self::convert_case($data, CASE_LOWER); - } - - /** - * Convert a data to upper case - * - * @param array|scalar|null $data - * @return scalar|null Returns FALSE if error occurred - */ - public static function strtoupper($data) - { - if (!ReflectionTypeHint::isValid()) { - return false; - } - return self::convert_case($data, CASE_UPPER); - } - - /** - * Convert all HTML entities to native UTF-8 characters - * Функция декодирует гораздо больше именованных сущностей, чем стандартная html_entity_decode() - * Все dec и hex сущности так же переводятся в UTF-8. - * - * Example: '"' or '"' or '"' will be converted to '"'. 
- * - * @link http://www.htmlhelp.com/reference/html40/entities/ - * @link http://www.alanwood.net/demos/ent4_frame.html (HTML 4.01 Character Entity References) - * @link http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset1.asp?frame=true - * @link http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset2.asp?frame=true - * @link http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset3.asp?frame=true - * - * @param scalar|null $s - * @param bool $is_special_chars Дополнительно обрабатывать специальные html сущности? (< > & ") - * @return scalar|null Returns FALSE if error occurred - */ - public static function html_entity_decode($s, $is_special_chars = false) - { - if (!ReflectionTypeHint::isValid()) { - return false; - } - if (!is_string($s)) { - return $s; - } - - #speed improve - if (strlen($s) < 4 #по минимальной длине сущности - 4 байта: &#d; &xx; - || ($pos = strpos($s, '&') === false) || strpos($s, ';', $pos) === false - ) { - return $s; - } - - $table = self::$html_entity_table; - if ($is_special_chars) { - $table += self::$html_special_chars_table; - } - - #replace named entities - $s = strtr($s, $table); - #block below deprecated, since PHP-5.3.x strtr() 1.5 times faster - if (0 && preg_match_all('/&[a-zA-Z]++\d*+;/sSX', $s, $m, null, $pos)) { - foreach (array_unique($m[0]) as $entity) { - if (array_key_exists($entity, $table)) { - $s = str_replace($entity, $table[$entity], $s); - } - } - } - - #заменяем числовые dec и hex сущности: - if (strpos($s, '&#') !== false) { - #speed improve - - $class = __CLASS__; - $html_special_chars_table_flipped = array_flip(self::$html_special_chars_table); - $s = preg_replace_callback('/&#((x)[\da-fA-F]{1,6}+|\d{1,7}+);/sSX', - function (array $m) use ($class, $html_special_chars_table_flipped, $is_special_chars) { - $codepoint = isset($m[2]) && $m[2] === 'x' ? 
hexdec($m[1]) : $m[1]; - if (!$is_special_chars) { - $char = pack('C', $codepoint); - if (array_key_exists($char, $html_special_chars_table_flipped)) { - return $html_special_chars_table_flipped[$char]; - } - } - return $class::chr($codepoint); - }, $s); - } - return $s; - } - - /** - * Convert special UTF-8 characters to HTML entities. - * Функция кодирует гораздо больше именованных сущностей, чем стандартная htmlentities() - * - * @link http://www.htmlhelp.com/reference/html40/entities/ - * @link http://www.alanwood.net/demos/ent4_frame.html (HTML 4.01 Character Entity References) - * @link http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset1.asp?frame=true - * @link http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset2.asp?frame=true - * @link http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset3.asp?frame=true - * - * @param scalar|null $s - * @param bool $is_special_chars_only Обрабатывать только специальные html сущности? 
(< > & ") - * @return scalar|null Returns FALSE if error occurred - */ - public static function html_entity_encode($s, $is_special_chars_only = false) - { - if (!ReflectionTypeHint::isValid()) { - return false; - } - if (!is_string($s)) { - return $s; - } - - #if ($is_special_chars_only) return strtr($s, array_flip(self::$html_special_chars_table)); - if ($is_special_chars_only) { - return htmlspecialchars($s); - } - - #replace UTF-8 chars to named entities: - $s = strtr($s, array_flip(self::$html_entity_table)); - #block below deprecated, since PHP-5.3.x strtr() 3 times faster - if (0 && preg_match_all('~(?> [\xc2\xc3\xc5\xc6\xcb\xce\xcf][\x80-\xbf] #2 bytes - | \xe2[\x80-\x99][\x82-\xac] #3 bytes - ) - ~sxSX', $s, $m) - ) { - $table = array_flip(self::$html_entity_table); - foreach (array_unique($m[0]) as $char) { - if (array_key_exists($char, $table)) { - $s = str_replace($char, $table[$char], $s); - } - } - } - - return $s; - } - - /** - * Make regular expression for case insensitive match - * Example (non ASCII): "123_слово_test" => "123_(с|С)(л|Л)(о|О)(в|В)(о|О)_[tT][eE][sS][tT]" - * Example (only ASCII): "123_test" => "(?i:123_test)" - * - * @param string $s - * @param string|null $delimiter If the optional delimiter is specified, it will also be escaped. - * This is useful for escaping the delimiter that is required by the PCRE functions. - * The / is the most commonly used delimiter. - * @return string|bool|null Returns FALSE if error occurred - */ - public static function preg_quote_case_insensitive($s, $delimiter = null) - { - if (!ReflectionTypeHint::isValid()) { - return false; - } - if (null === $s) { - return $s; - } - - if (self::is_ascii($s)) { - return '(?i:' . preg_quote($s, $delimiter) . 
')'; - } #speed improve - - $s_re = ''; - $s_lc = UTF8::lowercase($s); - if ($s_lc === false) { - return false; - } - $s_uc = UTF8::uppercase($s); - if ($s_uc === false) { - return false; - } - - $chars_lc = UTF8::str_split($s_lc); - if ($chars_lc === false) { - return false; - } - $chars_uc = UTF8::str_split($s_uc); - if ($chars_uc === false) { - return false; - } - - foreach ($chars_lc as $i => $char) { - if ($chars_lc[$i] === $chars_uc[$i]) { - $s_re .= preg_quote($chars_lc[$i], $delimiter); - } elseif (self::is_ascii($chars_lc[$i])) { - $s_re .= '[' . preg_quote($chars_lc[$i] . $chars_uc[$i], $delimiter) . ']'; - } else { - $s_re .= '(' . preg_quote($chars_lc[$i], $delimiter) . '|' - . preg_quote($chars_uc[$i], $delimiter) . ')'; - } - } - return $s_re; - } - - /** - * Call preg_match_all() and convert byte offsets into character offsets for PREG_OFFSET_CAPTURE flag. - * This is regardless of whether you use /u modifier. - * - * @link http://bolknote.ru/2010/09/08/~2704 - * - * @param string $pattern - * @param string|null $subject - * @param array $matches - * @param int $flags - * @param int $char_offset - * @return array|bool|null Returns FALSE if error occurred - */ - public static function preg_match_all($pattern, $subject, &$matches, $flags = PREG_PATTERN_ORDER, $char_offset = 0) - { - if (!ReflectionTypeHint::isValid()) { - return false; - } - if (null === $subject) { - return null; - } - - $byte_offset = ($char_offset > 0) ? 
strlen(self::substr($subject, 0, $char_offset)) : $char_offset; - - $return = preg_match_all($pattern, $subject, $matches, $flags, $byte_offset); - if ($return === false) { - return false; - } - - if ($flags & PREG_OFFSET_CAPTURE) { - foreach ($matches as &$match) { - foreach ($match as &$a) { - $a[1] = self::strlen(substr($subject, 0, $a[1])); - } - } - } - - return $return; - } - - #alias for self::str_limit() - public static function truncate($s, $maxlength = null, $continue = "\xe2\x80\xa6", &$is_cutted = null, $tail_min_length = 20) - { - return self::str_limit($s, $maxlength, $continue, $is_cutted, $tail_min_length); - } - - /** - * Обрезает текст в кодировке UTF-8 до заданной длины, - * причём последнее слово показывается целиком, а не обрывается на середине. - * Html сущности корректно обрабатываются. - * - * @param string|null $s Текст в кодировке UTF-8 - * @param int|null|digit $maxlength Ограничение длины текста - * @param string $continue Завершающая строка, которая будет вставлена после текста, если он обрежется - * @param bool|null &$is_cutted Текст был обрезан? 
- * @param int|digit $tail_min_length Если длина "хвоста", оставшегося после обрезки текста, меньше $tail_min_length, - * то текст возвращается без изменений - * @return string|bool|null Returns FALSE if error occurred - */ - public static function str_limit($s, $maxlength = null, $continue = "\xe2\x80\xa6", &$is_cutted = null, $tail_min_length = 20) #"\xe2\x80\xa6" = "…" - { - if (!ReflectionTypeHint::isValid()) { - return false; - } - if (null === $s) { - return $s; - } - - $is_cutted = false; - if ($continue === null) { - $continue = "\xe2\x80\xa6"; - } - if (!$maxlength) { - $maxlength = 256; - } - - #speed improve block - #{{{ - if (strlen($s) <= $maxlength) { - return $s; - } - $s2 = str_replace("\r\n", '?', $s); - $s2 = preg_replace('/&(?> [a-zA-Z][a-zA-Z\d]+ - | \#(?> \d{1,4} - | x[\da-fA-F]{2,4} - ) - ); # html сущности (< > & ") - /sxSX', '?', $s2); - if (strlen($s2) <= $maxlength || self::strlen($s2) <= $maxlength) { - return $s; - } - #}}} - - $r = preg_match_all('/(?> \r\n # переносы строк - | &(?> [a-zA-Z][a-zA-Z\d]+ - | \#(?> \d{1,4} - | x[\da-fA-F]{2,4} - ) - ); # html сущности (< > & ") - | . - ) - /sxuSX', $s, $m); - if ($r === false) { - return false; - } - - #d($m); - if (count($m[0]) <= $maxlength) { - return $s; - } - - $left = implode('', array_slice($m[0], 0, $maxlength)); - #из диапазона ASCII исключаем буквы, цифры, открывающие парные символы [a-zA-Z\d\(\{\[] и некоторые др. символы - #нельзя вырезать в конце строки символ ";", т.к. он используются в сущностях &xxx; - $left2 = rtrim($left, "\x00..\x28\x2A..\x2F\x3A\x3C..\x3E\x40\x5B\x5C\x5E..\x60\x7B\x7C\x7E\x7F"); - if (strlen($left) !== strlen($left2)) { - $return = $left2 . $continue; - } else { - #добавляем остаток к обрезанному слову - $right = implode('', array_slice($m[0], $maxlength)); - preg_match('/^(?> [\d\)\]\}\-\.:]+ #цифры, закрывающие парные символы, дефис для составных слов, дата, время, IP-адреса, URL типа www.ya.ru:80! 
- | \p{L}+ #буквы - | \xe2\x80\x9d #закрывающие кавычки - | \xe2\x80\x99 #закрывающие кавычки - | \xe2\x80\x9c #закрывающие кавычки - | \xc2\xbb #закрывающие кавычки - )+ - /suxSX', $right, $m); - #d($m); - $right = isset($m[0]) ? rtrim($m[0], '.-') : ''; - $return = $left . $right; - if (strlen($return) !== strlen($s)) { - $return .= $continue; - } - } - if (self::strlen($s) - self::strlen($return) < $tail_min_length) { - return $s; - } - - $is_cutted = true; - return $return; - } - - /** - * Implementation str_split() function for UTF-8 encoding string. - * - * @param string|null $s - * @param int|null|digit $length - * @return array|bool|null Returns FALSE if error occurred - */ - public static function str_split($s, $length = null) - { - if (!ReflectionTypeHint::isValid()) { - return false; - } - if (null === $s) { - return $s; - } - - $length = ($length === null) ? 1 : (int)$length; - if ($length < 1) { - return false; - } - #there are limits in regexp for {min,max}! - if (preg_match_all('~.~suSX', $s, $m) === false) { - return false; - } - if (function_exists('preg_last_error') && preg_last_error() !== PREG_NO_ERROR) { - return false; - } - if ($length === 1) { - $a = $m[0]; - } else { - $a = array(); - for ($i = 0, $c = count($m[0]); $i < $c; $i += $length) { - $a[] = implode('', array_slice($m[0], $i, $length)); - } - } - return $a; - } - - /** - * Implementation strlen() function for UTF-8 encoding string. - * - * @param string|null $s - * @return int|bool|null Returns FALSE if error occurred - */ - public static function strlen($s) - { - if (!ReflectionTypeHint::isValid()) { - return false; - } - if (null === $s) { - return $s; - } - - //since PHP-5.3.x mb_strlen() faster then strlen(utf8_decode()) - if (function_exists('mb_strlen')) { - return mb_strlen($s, 'utf-8'); - } - - /* - utf8_decode() converts characters that are not in ISO-8859-1 to '?', which, for the purpose of counting, is quite alright. 
- It's much faster than iconv_strlen() - Note: this function does not count bad UTF-8 bytes in the string - these are simply ignored - */ - return strlen(utf8_decode($s)); - - /* - #slowly then strlen(utf8_decode()) - if (function_exists('iconv_strlen')) return iconv_strlen($s, 'utf-8'); - - #Do not count UTF-8 continuation bytes - #return strlen(preg_replace('/[\x80-\xBF]/sSX', '', $s)); - - #slowly then strlen(utf8_decode()) - preg_match_all('~.~suSX', $str, $m); - return count($m[0]); - - #slowly then preg_match_all() + count() - $n = 0; - for ($i = 0, $len = strlen($s); $i < $len; $i++) - { - $c = ord(substr($s, $i, 1)); - if ($c < 0x80) $n++; #single-byte (0xxxxxx) - elseif (($c & 0xC0) == 0xC0) $n++; #multi-byte starting byte (11xxxxxx) - } - return $n; - */ - } - - /** - * Implementation strpos() function for UTF-8 encoding string - * - * @param string|null $s The entire string - * @param string|int $needle The searched substring - * @param int|null $offset The optional offset parameter specifies the position from which the search should be performed - * @return int|bool|null Returns the numeric position of the first occurrence of needle in haystack. - * If needle is not found, will return FALSE. 
- */ - public static function strpos($s, $needle, $offset = null) - { - if (!ReflectionTypeHint::isValid()) { - return false; - } - if (null === $s) { - return $s; - } - - if ($offset === null || $offset < 0) { - $offset = 0; - } - if (function_exists('mb_strpos')) { - return mb_strpos($s, $needle, $offset, 'utf-8'); - } - #iconv_strpos() deprecated, because slowly than self::strlen(substr()) - #if (function_exists('iconv_strpos')) return iconv_strpos($s, $needle, $offset, 'utf-8'); - $byte_pos = $offset; - do { - if (($byte_pos = strpos($s, $needle, $byte_pos)) === false) { - return false; - } - } while (($char_pos = self::strlen(substr($s, 0, $byte_pos++))) < $offset); - return $char_pos; - } - - /** - * Find position of first occurrence of a case-insensitive string. - * - * @param string|null $s The entire string - * @param string|int $needle The searched substring - * @param int|null $offset The optional offset parameter specifies the position from which the search should be performed - * @return int|bool|null Returns the numeric position of the first occurrence of needle in haystack. - * If needle is not found, will return FALSE. 
- */ - public static function stripos($s, $needle, $offset = null) - { - if (!ReflectionTypeHint::isValid()) { - return false; - } - if (null === $s) { - return $s; - } - - if ($offset === null || $offset < 0) { - $offset = 0; - } - if (function_exists('mb_stripos')) { - return mb_stripos($s, $needle, $offset, 'utf-8'); - } - - #optimization block (speed improve) - #{{{ - $ascii_int = (int)self::is_ascii($s) + (int)self::is_ascii($needle); - if ($ascii_int === 1) { - return false; - } - if ($ascii_int === 2) { - return stripos($s, $needle, $offset); - } - #}}} - - $s = self::convert_case($s, CASE_LOWER, false); - if ($s === false) { - return false; - } - $needle = self::convert_case($needle, CASE_LOWER, false); - if ($needle === false) { - return false; - } - return self::strpos($s, $needle, $offset); - } - - /** - * Implementation strrev() function for UTF-8 encoding string - * - * @param string|null $s - * @return string|bool|null Returns FALSE if error occurred - */ - public static function strrev($s) - { - if (!ReflectionTypeHint::isValid()) { - return false; - } - if (null === $s) { - return $s; - } - - if (0) { - #TODO test speed - - $s = self::_convert($s, 'UTF-8', 'UTF-32'); - if (!is_string($s)) { - return false; - } - $s = implode('', array_reverse(str_split($s, 4))); - return self::_convert($s, 'UTF-32', 'UTF-8'); - } - - if (!is_array($a = self::str_split($s))) { - return false; - } - return implode('', array_reverse($a)); - } - - /** - * Implementation substr() function for UTF-8 encoding string. 
- * - * @link http://www.w3.org/International/questions/qa-forms-utf-8.html - * @param string|null $s - * @param int|digit $offset - * @param int|null|digit $length - * @return string|bool|null Returns FALSE if error occurred - */ - public static function substr($s, $offset, $length = null) - { - if (!ReflectionTypeHint::isValid()) { - return false; - } - if (null === $s) { - return $s; - } - - #since PHP-5.3.x mb_substr() faster then iconv_substr() - if (function_exists('mb_substr')) { - if ($length === null) { - $length = self::strlen($s); - } - return mb_substr($s, $offset, $length, 'utf-8'); - } - if (function_exists('iconv_substr')) { - if ($length === null) { - $length = self::strlen($s); - } - return iconv_substr($s, $offset, $length, 'utf-8'); - } - - static $_s = null; - static $_a = null; - - if ($_s !== $s) { - $_a = self::str_split($_s = $s); - } - if (!is_array($_a)) { - return false; - } - if ($length !== null) { - $a = array_slice($_a, $offset, $length); - } else { - $a = array_slice($_a, $offset); - } - return implode('', $a); - } - - /** - * Implementation substr_replace() function for UTF-8 encoding string. - * - * @param string|null $s - * @param string|int $replacement - * @param int|digit $start - * @param int|null $length - * @return string|bool|null Returns FALSE if error occurred - */ - public static function substr_replace($s, $replacement, $start, $length = null) - { - if (!ReflectionTypeHint::isValid()) { - return false; - } - if (null === $s) { - return $s; - } - - if (!is_array($a = self::str_split($s))) { - return false; - } - array_splice($a, $start, $length, $replacement); - return implode('', $a); - } - - /** - * Implementation ucfirst() function for UTF-8 encoding string. - * Преобразует первый символ строки в кодировке UTF-8 в верхний регистр. - * - * @param string|null $s - * @param bool $is_other_to_lowercase остальные символы преобразуются в нижний регистр? 
- * @return string|bool|null Returns FALSE if error occurred - */ - public static function ucfirst($s, $is_other_to_lowercase = true) - { - if (!ReflectionTypeHint::isValid()) { - return false; - } - if (null === $s) { - return $s; - } - - if ($s === '' || !is_string($s)) { - return $s; - } - if (!preg_match('/^(.)(.*)$/suSX', $s, $m)) { - return false; - } - return self::uppercase($m[1]) . ($is_other_to_lowercase ? self::lowercase($m[2]) : $m[2]); - } - - /** - * Implementation ucwords() function for UTF-8 encoding string. - * Преобразует в верхний регистр первый символ каждого слова в строке в кодировке UTF-8, - * остальные символы каждого слова преобразуются в нижний регистр. - * - * @param string|null $s - * @param bool $is_other_to_lowercase остальные символы преобразуются в нижний регистр? - * @param string $spaces_re - * @return string|bool|null Returns FALSE if error occurred - */ - public static function ucwords($s, $is_other_to_lowercase = true, $spaces_re = '~([\pZ\s]+)~suSX') #\pXps is POSIX space: property Z or tab, NL, VT, FF, CR - { - if (!ReflectionTypeHint::isValid()) { - return false; - } - if (null === $s) { - return $s; - } - - $words = preg_split($spaces_re, $s, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE); - foreach ($words as $k => $word) { - $words[$k] = self::ucfirst($word, $is_other_to_lowercase = true); - if ($words[$k] === false) { - return false; - } - } - return implode('', $words); - } - - /** - * Decodes a string in the format %uXXXX or %u{XXXXXX} in the UTF-8 string. - * - * Используется для декодирования данных типа "%u0442%u0435%u0441%u0442", - * закодированных устаревшей функцией javascript://encode(). - * Рекомендуется использовать функцию javascript://encodeURIComponent(). - * - * NOTICE - * Устаревший формат %uXXXX позволяет использовать юникод только из диапазона UCS-2, т.е. 
от U+0 до U+FFFF - * - * @param scalar|array|null $data - * @param bool $is_rawurlencode - * @return scalar|array|null Returns FALSE if error occurred - */ - public static function unescape($data, $is_rawurlencode = false) - { - if (!ReflectionTypeHint::isValid()) { - return false; - } - if (is_array($data)) { - $d = array(); - foreach ($data as $k => &$v) { - $k = self::unescape($k, $is_rawurlencode); - if ($k === false) { - return false; - } - $d[$k] = self::unescape($v, $is_rawurlencode); - if ($d[$k] === false && !is_bool($v)) { - return false; - } - } - return $d; - } - if (is_string($data)) { - if (strpos($data, '%u') === false) { - return $data; - } #use strpos() for speed improving - return preg_replace_callback('/%u( [\da-fA-F]{4}+ #%uXXXX only UCS-2 - | \{ [\da-fA-F]{1,6}+ \} #%u{XXXXXX} extended form for all UNICODE charts - ) - /sxSX', - function (array $m) use ($is_rawurlencode) { - $codepoint = hexdec(trim($m[1], '{}')); - $char = self::chr($codepoint); - return $is_rawurlencode ? rawurlencode($char) : $char; - }, - $data); - } - if (is_scalar($data) || null === $data) { - return $data; - } #~ null, integer, float, boolean - return false; #object or resource - } - - /** - * 1) Corrects the global arrays $_GET, $_POST, $_COOKIE, $_REQUEST - * decoded values ​​in the format %uXXXX and %u{XXXXXX}, encoded, - * for example, through an outdated javascript function escape(). - * Standard PHP5 cannot do it. - * 2) If in the HTTP_COOKIE there are parameters with the same name, - * takes the last value, not the first, as in the QUERY_STRING. - * 3) Creates an array of $_POST for non-standard Content-Type, for example, "Content-Type: application/octet-stream". - * Standard PHP5 creates an array for "Content-Type: application/x-www-form-urlencoded" and "Content-Type: multipart/form-data". - * - * Сессии, куки и независимая авторизация на поддоменах. - * - * ПРИМЕР 1 - * У рабочего сайта http://domain.com появились поддомены. 
- * Для кроссдоменной авторизации через механизм сессий имя хоста для COOKIE было изменено с "domain.com" на ".domain.com" - * В результате авторизация не работает. - * Помогает очистка COOKIE, но их принудительная очистка на тысячах пользовательских компьютеров проблематична. - * Проблема в следующем: если в HTTP_COOKIE есть параметры с одинаковым именем, то берётся последнее значение, - * а не первое, как в QUERY_STRING. - * Более подробное описание: - * PHP не правильно (?) обрабатывает заголовок HTTP_COOKIE, если там встречаются параметры с одинаковым именем, но разными значениями. - * Пример запроса HTTP-заголовка клиентом: "Cookie: sid=chpgs2fiak-330mzqza; sid=cmz5tnp5zz-xlbbgqp" - * В этом случае сервер берёт первое значение, а не последнее. - * Хотя если в QUERY_STRING есть такая ситуация, всегда берётся последний параметр. - * В HTTP_COOKIE два параметра с одинаковым именем могут появиться, если отправить клиенту следующие HTTP-заголовки: - * "Set-Cookie: sid=chpgs2fiak-330mzqza; expires=Thu, 15 Oct 2009 14:23:42 GMT; path=/; domain=domain.com" (только domain.com) - * "Set-Cookie: sid=cmz6uqorzv-1bn35110; expires=Thu, 15 Oct 2009 14:23:42 GMT; path=/; domain=.domain.com" (domain.com и все его поддомены) - * Решение: поменять имя сессии. - * - * ПРИМЕР 2 - * Есть рабочие сайты: http://domain.com (основной), http://admin.domain.com (админка), - * http://sub1.domain.com (подпроект 1), http://sub2.domain.com, (подпроект 2). - * Так же имеется сервер разработки http://dev.domain.com, на котором м. б. свои поддомены. - * Требуется сделать независимую кросс-доменную авторизацию для http://*.domain.com и http://*.dev.domain.com. - * Для сохранения статуса авторизации будем использовать сессию, имя и значение которой пишется в COOKIE. - * Т. к. домены http://*.dev.domain.com имеют пересечение с доменами http://*.domain.com, - * для независимой авторизации нужно использовать разные имена сессий. 
- * Пример HTTP заголовков ответа сервера: - * "Set-Cookie: sid=chpgs2fiak-330mzqza; expires=Thu, 15 Oct 2009 14:23:42 GMT; path=/; domain=.domain.com" (.domain.com и все его поддомены) - * "Set-Cookie: sid.dev=cmz6uqorzv-1bn35110; expires=Thu, 15 Oct 2009 14:23:42 GMT; path=/; domain=.dev.domain.com" (dev.domain.com и все его поддомены) - * - * @link http://tools.ietf.org/html/rfc2965 RFC 2965 - HTTP State Management Mechanism - * @return void - */ - public static function unescape_request() - { - $fixed = false; - #ATTENTION! HTTP_RAW_POST_DATA is only accessible when Content-Type of POST request is NOT default "application/x-www-form-urlencoded"! - $HTTP_RAW_POST_DATA = isset($_SERVER['REQUEST_METHOD']) && $_SERVER['REQUEST_METHOD'] === 'POST' ? (isset($GLOBALS['HTTP_RAW_POST_DATA']) ? $GLOBALS['HTTP_RAW_POST_DATA'] : @file_get_contents('php://input')) : null; - if (ini_get('always_populate_raw_post_data')) { - $GLOBALS['HTTP_RAW_POST_DATA'] = $HTTP_RAW_POST_DATA; - } - foreach (array('_GET' => isset($_SERVER['QUERY_STRING']) ? $_SERVER['QUERY_STRING'] : null, - '_POST' => $HTTP_RAW_POST_DATA, - '_COOKIE' => isset($_SERVER['HTTP_COOKIE']) ? $_SERVER['HTTP_COOKIE'] : null, - ) as $k => $v) { - if (!is_string($v)) { - continue; - } - if ($k === '_COOKIE') { - $v = preg_replace('/; *+/sSX', '&', $v); - unset($_COOKIE); #будем парсить HTTP_COOKIE сами, чтобы сделать обработку как у QUERY_STRING - } - if (strpos($v, '%u') !== false) { - parse_str(self::unescape($v, $is_rawurlencode = true), $GLOBALS[$k]); - $fixed = true; - continue; - } - if (array_key_exists($k, $GLOBALS)) { - continue; - } - parse_str($v, $GLOBALS[$k]); - $fixed = true; - } - if ($fixed) { - $_REQUEST = - (isset($_COOKIE) ? $_COOKIE : array()) + - (isset($_POST) ? $_POST : array()) + - (isset($_GET) ? $_GET : array()); - } - } - - /** - * Calculates the height of the edit text in