From fe31e3b07af9ce81763e6c19b5e2224dea82e232 Mon Sep 17 00:00:00 2001 From: JonnyWong16 <9099342+JonnyWong16@users.noreply.github.com> Date: Sat, 30 Mar 2024 15:23:51 -0700 Subject: [PATCH] Update requests-oauthlib==2.0.0 --- lib/charset_normalizer/__main__.py | 4 + lib/charset_normalizer/assets/__init__.py | 1440 ------------ lib/charset_normalizer/cd.py | 9 +- lib/charset_normalizer/cli/__init__.py | 6 + .../cli/{normalizer.py => __main__.py} | 0 lib/charset_normalizer/constant.py | 2070 ++++++++++++++--- lib/charset_normalizer/md.py | 49 +- lib/charset_normalizer/models.py | 11 +- lib/charset_normalizer/utils.py | 43 +- lib/charset_normalizer/version.py | 2 +- lib/idna/codec.py | 34 +- lib/idna/core.py | 10 +- lib/idna/idnadata.py | 9 +- lib/idna/package_data.py | 2 +- lib/idna/uts46data.py | 454 ++-- lib/oauthlib/__init__.py | 2 +- lib/oauthlib/common.py | 6 +- lib/oauthlib/oauth1/__init__.py | 35 +- lib/oauthlib/oauth1/rfc5849/endpoints/base.py | 11 +- .../oauth1/rfc5849/endpoints/request_token.py | 4 +- .../oauth1/rfc5849/endpoints/resource.py | 4 +- .../oauth1/rfc5849/request_validator.py | 6 +- lib/oauthlib/oauth1/rfc5849/signature.py | 65 +- .../rfc6749/clients/backend_application.py | 2 +- lib/oauthlib/oauth2/rfc6749/clients/base.py | 165 +- .../rfc6749/clients/legacy_application.py | 2 +- .../rfc6749/clients/mobile_application.py | 2 +- .../rfc6749/clients/service_application.py | 4 +- .../oauth2/rfc6749/clients/web_application.py | 4 +- .../oauth2/rfc6749/endpoints/introspect.py | 10 +- .../oauth2/rfc6749/endpoints/metadata.py | 4 +- .../oauth2/rfc6749/endpoints/revocation.py | 4 +- .../rfc6749/grant_types/authorization_code.py | 18 - .../oauth2/rfc6749/grant_types/base.py | 18 + .../rfc6749/grant_types/refresh_token.py | 1 + lib/oauthlib/oauth2/rfc6749/parameters.py | 2 +- .../oauth2/rfc6749/request_validator.py | 10 +- lib/oauthlib/oauth2/rfc6749/tokens.py | 1 + lib/oauthlib/oauth2/rfc8628/clients/device.py | 7 +- .../openid/connect/core/endpoints/userinfo.py | 21 +- .../openid/connect/core/grant_types/base.py | 1 - .../connect/core/grant_types/dispatchers.py | 2 +- lib/oauthlib/openid/connect/core/tokens.py | 4 +- lib/oauthlib/uri_validate.py | 2 +- lib/requests_oauthlib/__init__.py | 3 +- .../compliance_fixes/__init__.py | 3 +- .../compliance_fixes/douban.py | 4 +- .../compliance_fixes/ebay.py | 3 +- .../compliance_fixes/facebook.py | 10 +- .../compliance_fixes/fitbit.py | 4 +- .../compliance_fixes/instagram.py | 5 +- .../compliance_fixes/mailchimp.py | 6 +- .../compliance_fixes/plentymarkets.py | 4 +- .../compliance_fixes/slack.py | 5 +- .../compliance_fixes/weibo.py | 4 +- lib/requests_oauthlib/oauth1_auth.py | 13 +- lib/requests_oauthlib/oauth1_session.py | 13 +- lib/requests_oauthlib/oauth2_auth.py | 1 - lib/requests_oauthlib/oauth2_session.py | 65 +- 59 files changed, 2413 insertions(+), 2290 deletions(-) create mode 100644 lib/charset_normalizer/__main__.py delete mode 100644 lib/charset_normalizer/assets/__init__.py rename lib/charset_normalizer/cli/{normalizer.py => __main__.py} (100%) diff --git a/lib/charset_normalizer/__main__.py b/lib/charset_normalizer/__main__.py new file mode 100644 index 00000000..beae2ef7 --- /dev/null +++ b/lib/charset_normalizer/__main__.py @@ -0,0 +1,4 @@ +from .cli import cli_detect + +if __name__ == "__main__": + cli_detect() diff --git a/lib/charset_normalizer/assets/__init__.py b/lib/charset_normalizer/assets/__init__.py deleted file mode 100644 index 9075930d..00000000 --- a/lib/charset_normalizer/assets/__init__.py +++ /dev/null 
@@ -1,1440 +0,0 @@ -# -*- coding: utf-8 -*- -from typing import Dict, List - -# Language label that contain the em dash "—" -# character are to be considered alternative seq to origin -FREQUENCIES: Dict[str, List[str]] = { - "English": [ - "e", - "a", - "t", - "i", - "o", - "n", - "s", - "r", - "h", - "l", - "d", - "c", - "u", - "m", - "f", - "p", - "g", - "w", - "y", - "b", - "v", - "k", - "x", - "j", - "z", - "q", - ], - "English—": [ - "e", - "a", - "t", - "i", - "o", - "n", - "s", - "r", - "h", - "l", - "d", - "c", - "m", - "u", - "f", - "p", - "g", - "w", - "b", - "y", - "v", - "k", - "j", - "x", - "z", - "q", - ], - "German": [ - "e", - "n", - "i", - "r", - "s", - "t", - "a", - "d", - "h", - "u", - "l", - "g", - "o", - "c", - "m", - "b", - "f", - "k", - "w", - "z", - "p", - "v", - "ü", - "ä", - "ö", - "j", - ], - "French": [ - "e", - "a", - "s", - "n", - "i", - "t", - "r", - "l", - "u", - "o", - "d", - "c", - "p", - "m", - "é", - "v", - "g", - "f", - "b", - "h", - "q", - "à", - "x", - "è", - "y", - "j", - ], - "Dutch": [ - "e", - "n", - "a", - "i", - "r", - "t", - "o", - "d", - "s", - "l", - "g", - "h", - "v", - "m", - "u", - "k", - "c", - "p", - "b", - "w", - "j", - "z", - "f", - "y", - "x", - "ë", - ], - "Italian": [ - "e", - "i", - "a", - "o", - "n", - "l", - "t", - "r", - "s", - "c", - "d", - "u", - "p", - "m", - "g", - "v", - "f", - "b", - "z", - "h", - "q", - "è", - "à", - "k", - "y", - "ò", - ], - "Polish": [ - "a", - "i", - "o", - "e", - "n", - "r", - "z", - "w", - "s", - "c", - "t", - "k", - "y", - "d", - "p", - "m", - "u", - "l", - "j", - "ł", - "g", - "b", - "h", - "ą", - "ę", - "ó", - ], - "Spanish": [ - "e", - "a", - "o", - "n", - "s", - "r", - "i", - "l", - "d", - "t", - "c", - "u", - "m", - "p", - "b", - "g", - "v", - "f", - "y", - "ó", - "h", - "q", - "í", - "j", - "z", - "á", - ], - "Russian": [ - "о", - "а", - "е", - "и", - "н", - "с", - "т", - "р", - "в", - "л", - "к", - "м", - "д", - "п", - "у", - "г", - "я", - "ы", - "з", - "б", - "й", - "ь", - "ч", - "х", - "ж", - "ц", - ], - # Jap-Kanji - "Japanese": [ - "人", - "一", - "大", - "亅", - "丁", - "丨", - "竹", - "笑", - "口", - "日", - "今", - "二", - "彳", - "行", - "十", - "土", - "丶", - "寸", - "寺", - "時", - "乙", - "丿", - "乂", - "气", - "気", - "冂", - "巾", - "亠", - "市", - "目", - "儿", - "見", - "八", - "小", - "凵", - "県", - "月", - "彐", - "門", - "間", - "木", - "東", - "山", - "出", - "本", - "中", - "刀", - "分", - "耳", - "又", - "取", - "最", - "言", - "田", - "心", - "思", - "刂", - "前", - "京", - "尹", - "事", - "生", - "厶", - "云", - "会", - "未", - "来", - "白", - "冫", - "楽", - "灬", - "馬", - "尸", - "尺", - "駅", - "明", - "耂", - "者", - "了", - "阝", - "都", - "高", - "卜", - "占", - "厂", - "广", - "店", - "子", - "申", - "奄", - "亻", - "俺", - "上", - "方", - "冖", - "学", - "衣", - "艮", - "食", - "自", - ], - # Jap-Katakana - "Japanese—": [ - "ー", - "ン", - "ス", - "・", - "ル", - "ト", - "リ", - "イ", - "ア", - "ラ", - "ッ", - "ク", - "ド", - "シ", - "レ", - "ジ", - "タ", - "フ", - "ロ", - "カ", - "テ", - "マ", - "ィ", - "グ", - "バ", - "ム", - "プ", - "オ", - "コ", - "デ", - "ニ", - "ウ", - "メ", - "サ", - "ビ", - "ナ", - "ブ", - "ャ", - "エ", - "ュ", - "チ", - "キ", - "ズ", - "ダ", - "パ", - "ミ", - "ェ", - "ョ", - "ハ", - "セ", - "ベ", - "ガ", - "モ", - "ツ", - "ネ", - "ボ", - "ソ", - "ノ", - "ァ", - "ヴ", - "ワ", - "ポ", - "ペ", - "ピ", - "ケ", - "ゴ", - "ギ", - "ザ", - "ホ", - "ゲ", - "ォ", - "ヤ", - "ヒ", - "ユ", - "ヨ", - "ヘ", - "ゼ", - "ヌ", - "ゥ", - "ゾ", - "ヶ", - "ヂ", - "ヲ", - "ヅ", - "ヵ", - "ヱ", - "ヰ", - "ヮ", - "ヽ", - "゠", - "ヾ", - "ヷ", - "ヿ", - "ヸ", - "ヹ", - "ヺ", - ], - # Jap-Hiragana - "Japanese——": [ - "の", - "に", - "る", - "た", - "と", - 
"は", - "し", - "い", - "を", - "で", - "て", - "が", - "な", - "れ", - "か", - "ら", - "さ", - "っ", - "り", - "す", - "あ", - "も", - "こ", - "ま", - "う", - "く", - "よ", - "き", - "ん", - "め", - "お", - "け", - "そ", - "つ", - "だ", - "や", - "え", - "ど", - "わ", - "ち", - "み", - "せ", - "じ", - "ば", - "へ", - "び", - "ず", - "ろ", - "ほ", - "げ", - "む", - "べ", - "ひ", - "ょ", - "ゆ", - "ぶ", - "ご", - "ゃ", - "ね", - "ふ", - "ぐ", - "ぎ", - "ぼ", - "ゅ", - "づ", - "ざ", - "ぞ", - "ぬ", - "ぜ", - "ぱ", - "ぽ", - "ぷ", - "ぴ", - "ぃ", - "ぁ", - "ぇ", - "ぺ", - "ゞ", - "ぢ", - "ぉ", - "ぅ", - "ゐ", - "ゝ", - "ゑ", - "゛", - "゜", - "ゎ", - "ゔ", - "゚", - "ゟ", - "゙", - "ゕ", - "ゖ", - ], - "Portuguese": [ - "a", - "e", - "o", - "s", - "i", - "r", - "d", - "n", - "t", - "m", - "u", - "c", - "l", - "p", - "g", - "v", - "b", - "f", - "h", - "ã", - "q", - "é", - "ç", - "á", - "z", - "í", - ], - "Swedish": [ - "e", - "a", - "n", - "r", - "t", - "s", - "i", - "l", - "d", - "o", - "m", - "k", - "g", - "v", - "h", - "f", - "u", - "p", - "ä", - "c", - "b", - "ö", - "å", - "y", - "j", - "x", - ], - "Chinese": [ - "的", - "一", - "是", - "不", - "了", - "在", - "人", - "有", - "我", - "他", - "这", - "个", - "们", - "中", - "来", - "上", - "大", - "为", - "和", - "国", - "地", - "到", - "以", - "说", - "时", - "要", - "就", - "出", - "会", - "可", - "也", - "你", - "对", - "生", - "能", - "而", - "子", - "那", - "得", - "于", - "着", - "下", - "自", - "之", - "年", - "过", - "发", - "后", - "作", - "里", - "用", - "道", - "行", - "所", - "然", - "家", - "种", - "事", - "成", - "方", - "多", - "经", - "么", - "去", - "法", - "学", - "如", - "都", - "同", - "现", - "当", - "没", - "动", - "面", - "起", - "看", - "定", - "天", - "分", - "还", - "进", - "好", - "小", - "部", - "其", - "些", - "主", - "样", - "理", - "心", - "她", - "本", - "前", - "开", - "但", - "因", - "只", - "从", - "想", - "实", - ], - "Ukrainian": [ - "о", - "а", - "н", - "і", - "и", - "р", - "в", - "т", - "е", - "с", - "к", - "л", - "у", - "д", - "м", - "п", - "з", - "я", - "ь", - "б", - "г", - "й", - "ч", - "х", - "ц", - "ї", - ], - "Norwegian": [ - "e", - "r", - "n", - "t", - "a", - "s", - "i", - "o", - "l", - "d", - "g", - "k", - "m", - "v", - "f", - "p", - "u", - "b", - "h", - "å", - "y", - "j", - "ø", - "c", - "æ", - "w", - ], - "Finnish": [ - "a", - "i", - "n", - "t", - "e", - "s", - "l", - "o", - "u", - "k", - "ä", - "m", - "r", - "v", - "j", - "h", - "p", - "y", - "d", - "ö", - "g", - "c", - "b", - "f", - "w", - "z", - ], - "Vietnamese": [ - "n", - "h", - "t", - "i", - "c", - "g", - "a", - "o", - "u", - "m", - "l", - "r", - "à", - "đ", - "s", - "e", - "v", - "p", - "b", - "y", - "ư", - "d", - "á", - "k", - "ộ", - "ế", - ], - "Czech": [ - "o", - "e", - "a", - "n", - "t", - "s", - "i", - "l", - "v", - "r", - "k", - "d", - "u", - "m", - "p", - "í", - "c", - "h", - "z", - "á", - "y", - "j", - "b", - "ě", - "é", - "ř", - ], - "Hungarian": [ - "e", - "a", - "t", - "l", - "s", - "n", - "k", - "r", - "i", - "o", - "z", - "á", - "é", - "g", - "m", - "b", - "y", - "v", - "d", - "h", - "u", - "p", - "j", - "ö", - "f", - "c", - ], - "Korean": [ - "이", - "다", - "에", - "의", - "는", - "로", - "하", - "을", - "가", - "고", - "지", - "서", - "한", - "은", - "기", - "으", - "년", - "대", - "사", - "시", - "를", - "리", - "도", - "인", - "스", - "일", - ], - "Indonesian": [ - "a", - "n", - "e", - "i", - "r", - "t", - "u", - "s", - "d", - "k", - "m", - "l", - "g", - "p", - "b", - "o", - "h", - "y", - "j", - "c", - "w", - "f", - "v", - "z", - "x", - "q", - ], - "Turkish": [ - "a", - "e", - "i", - "n", - "r", - "l", - "ı", - "k", - "d", - "t", - "s", - "m", - "y", - "u", - "o", - "b", - "ü", - "ş", - "v", - "g", - "z", - "h", - "c", - "p", 
- "ç", - "ğ", - ], - "Romanian": [ - "e", - "i", - "a", - "r", - "n", - "t", - "u", - "l", - "o", - "c", - "s", - "d", - "p", - "m", - "ă", - "f", - "v", - "î", - "g", - "b", - "ș", - "ț", - "z", - "h", - "â", - "j", - ], - "Farsi": [ - "ا", - "ی", - "ر", - "د", - "ن", - "ه", - "و", - "م", - "ت", - "ب", - "س", - "ل", - "ک", - "ش", - "ز", - "ف", - "گ", - "ع", - "خ", - "ق", - "ج", - "آ", - "پ", - "ح", - "ط", - "ص", - ], - "Arabic": [ - "ا", - "ل", - "ي", - "م", - "و", - "ن", - "ر", - "ت", - "ب", - "ة", - "ع", - "د", - "س", - "ف", - "ه", - "ك", - "ق", - "أ", - "ح", - "ج", - "ش", - "ط", - "ص", - "ى", - "خ", - "إ", - ], - "Danish": [ - "e", - "r", - "n", - "t", - "a", - "i", - "s", - "d", - "l", - "o", - "g", - "m", - "k", - "f", - "v", - "u", - "b", - "h", - "p", - "å", - "y", - "ø", - "æ", - "c", - "j", - "w", - ], - "Serbian": [ - "а", - "и", - "о", - "е", - "н", - "р", - "с", - "у", - "т", - "к", - "ј", - "в", - "д", - "м", - "п", - "л", - "г", - "з", - "б", - "a", - "i", - "e", - "o", - "n", - "ц", - "ш", - ], - "Lithuanian": [ - "i", - "a", - "s", - "o", - "r", - "e", - "t", - "n", - "u", - "k", - "m", - "l", - "p", - "v", - "d", - "j", - "g", - "ė", - "b", - "y", - "ų", - "š", - "ž", - "c", - "ą", - "į", - ], - "Slovene": [ - "e", - "a", - "i", - "o", - "n", - "r", - "s", - "l", - "t", - "j", - "v", - "k", - "d", - "p", - "m", - "u", - "z", - "b", - "g", - "h", - "č", - "c", - "š", - "ž", - "f", - "y", - ], - "Slovak": [ - "o", - "a", - "e", - "n", - "i", - "r", - "v", - "t", - "s", - "l", - "k", - "d", - "m", - "p", - "u", - "c", - "h", - "j", - "b", - "z", - "á", - "y", - "ý", - "í", - "č", - "é", - ], - "Hebrew": [ - "י", - "ו", - "ה", - "ל", - "ר", - "ב", - "ת", - "מ", - "א", - "ש", - "נ", - "ע", - "ם", - "ד", - "ק", - "ח", - "פ", - "ס", - "כ", - "ג", - "ט", - "צ", - "ן", - "ז", - "ך", - ], - "Bulgarian": [ - "а", - "и", - "о", - "е", - "н", - "т", - "р", - "с", - "в", - "л", - "к", - "д", - "п", - "м", - "з", - "г", - "я", - "ъ", - "у", - "б", - "ч", - "ц", - "й", - "ж", - "щ", - "х", - ], - "Croatian": [ - "a", - "i", - "o", - "e", - "n", - "r", - "j", - "s", - "t", - "u", - "k", - "l", - "v", - "d", - "m", - "p", - "g", - "z", - "b", - "c", - "č", - "h", - "š", - "ž", - "ć", - "f", - ], - "Hindi": [ - "क", - "र", - "स", - "न", - "त", - "म", - "ह", - "प", - "य", - "ल", - "व", - "ज", - "द", - "ग", - "ब", - "श", - "ट", - "अ", - "ए", - "थ", - "भ", - "ड", - "च", - "ध", - "ष", - "इ", - ], - "Estonian": [ - "a", - "i", - "e", - "s", - "t", - "l", - "u", - "n", - "o", - "k", - "r", - "d", - "m", - "v", - "g", - "p", - "j", - "h", - "ä", - "b", - "õ", - "ü", - "f", - "c", - "ö", - "y", - ], - "Thai": [ - "า", - "น", - "ร", - "อ", - "ก", - "เ", - "ง", - "ม", - "ย", - "ล", - "ว", - "ด", - "ท", - "ส", - "ต", - "ะ", - "ป", - "บ", - "ค", - "ห", - "แ", - "จ", - "พ", - "ช", - "ข", - "ใ", - ], - "Greek": [ - "α", - "τ", - "ο", - "ι", - "ε", - "ν", - "ρ", - "σ", - "κ", - "η", - "π", - "ς", - "υ", - "μ", - "λ", - "ί", - "ό", - "ά", - "γ", - "έ", - "δ", - "ή", - "ω", - "χ", - "θ", - "ύ", - ], - "Tamil": [ - "க", - "த", - "ப", - "ட", - "ர", - "ம", - "ல", - "ன", - "வ", - "ற", - "ய", - "ள", - "ச", - "ந", - "இ", - "ண", - "அ", - "ஆ", - "ழ", - "ங", - "எ", - "உ", - "ஒ", - "ஸ", - ], - "Kazakh": [ - "а", - "ы", - "е", - "н", - "т", - "р", - "л", - "і", - "д", - "с", - "м", - "қ", - "к", - "о", - "б", - "и", - "у", - "ғ", - "ж", - "ң", - "з", - "ш", - "й", - "п", - "г", - "ө", - ], -} diff --git a/lib/charset_normalizer/cd.py b/lib/charset_normalizer/cd.py index 6e56fe84..4ea6760c 100644 --- 
a/lib/charset_normalizer/cd.py +++ b/lib/charset_normalizer/cd.py @@ -4,8 +4,13 @@ from collections import Counter from functools import lru_cache from typing import Counter as TypeCounter, Dict, List, Optional, Tuple -from .assets import FREQUENCIES -from .constant import KO_NAMES, LANGUAGE_SUPPORTED_COUNT, TOO_SMALL_SEQUENCE, ZH_NAMES +from .constant import ( + FREQUENCIES, + KO_NAMES, + LANGUAGE_SUPPORTED_COUNT, + TOO_SMALL_SEQUENCE, + ZH_NAMES, +) from .md import is_suspiciously_successive_range from .models import CoherenceMatches from .utils import ( diff --git a/lib/charset_normalizer/cli/__init__.py b/lib/charset_normalizer/cli/__init__.py index e69de29b..d95fedfe 100644 --- a/lib/charset_normalizer/cli/__init__.py +++ b/lib/charset_normalizer/cli/__init__.py @@ -0,0 +1,6 @@ +from .__main__ import cli_detect, query_yes_no + +__all__ = ( + "cli_detect", + "query_yes_no", +) diff --git a/lib/charset_normalizer/cli/normalizer.py b/lib/charset_normalizer/cli/__main__.py similarity index 100% rename from lib/charset_normalizer/cli/normalizer.py rename to lib/charset_normalizer/cli/__main__.py diff --git a/lib/charset_normalizer/constant.py b/lib/charset_normalizer/constant.py index 3188108d..86349046 100644 --- a/lib/charset_normalizer/constant.py +++ b/lib/charset_normalizer/constant.py @@ -1,10 +1,9 @@ +# -*- coding: utf-8 -*- from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE from encodings.aliases import aliases from re import IGNORECASE, compile as re_compile from typing import Dict, List, Set, Union -from .assets import FREQUENCIES - # Contain for each eligible encoding a list of/item bytes SIG/BOM ENCODING_MARKS: Dict[str, Union[bytes, List[bytes]]] = { "utf_8": BOM_UTF8, @@ -23,288 +22,338 @@ ENCODING_MARKS: Dict[str, Union[bytes, List[bytes]]] = { TOO_SMALL_SEQUENCE: int = 32 TOO_BIG_SEQUENCE: int = int(10e6) -UTF8_MAXIMAL_ALLOCATION: int = 1112064 +UTF8_MAXIMAL_ALLOCATION: int = 1_112_064 +# Up-to-date Unicode ucd/15.0.0 UNICODE_RANGES_COMBINED: Dict[str, range] = { - "Control character": range(31 + 1), - "Basic Latin": range(32, 127 + 1), - "Latin-1 Supplement": range(128, 255 + 1), - "Latin Extended-A": range(256, 383 + 1), - "Latin Extended-B": range(384, 591 + 1), - "IPA Extensions": range(592, 687 + 1), - "Spacing Modifier Letters": range(688, 767 + 1), - "Combining Diacritical Marks": range(768, 879 + 1), - "Greek and Coptic": range(880, 1023 + 1), - "Cyrillic": range(1024, 1279 + 1), - "Cyrillic Supplement": range(1280, 1327 + 1), - "Armenian": range(1328, 1423 + 1), - "Hebrew": range(1424, 1535 + 1), - "Arabic": range(1536, 1791 + 1), - "Syriac": range(1792, 1871 + 1), - "Arabic Supplement": range(1872, 1919 + 1), - "Thaana": range(1920, 1983 + 1), - "NKo": range(1984, 2047 + 1), - "Samaritan": range(2048, 2111 + 1), - "Mandaic": range(2112, 2143 + 1), - "Syriac Supplement": range(2144, 2159 + 1), - "Arabic Extended-A": range(2208, 2303 + 1), - "Devanagari": range(2304, 2431 + 1), - "Bengali": range(2432, 2559 + 1), - "Gurmukhi": range(2560, 2687 + 1), - "Gujarati": range(2688, 2815 + 1), - "Oriya": range(2816, 2943 + 1), - "Tamil": range(2944, 3071 + 1), - "Telugu": range(3072, 3199 + 1), - "Kannada": range(3200, 3327 + 1), - "Malayalam": range(3328, 3455 + 1), - "Sinhala": range(3456, 3583 + 1), - "Thai": range(3584, 3711 + 1), - "Lao": range(3712, 3839 + 1), - "Tibetan": range(3840, 4095 + 1), - "Myanmar": range(4096, 4255 + 1), - "Georgian": range(4256, 4351 + 1), - "Hangul Jamo": range(4352, 4607 + 1), - "Ethiopic": range(4608, 
4991 + 1), - "Ethiopic Supplement": range(4992, 5023 + 1), - "Cherokee": range(5024, 5119 + 1), - "Unified Canadian Aboriginal Syllabics": range(5120, 5759 + 1), - "Ogham": range(5760, 5791 + 1), - "Runic": range(5792, 5887 + 1), - "Tagalog": range(5888, 5919 + 1), - "Hanunoo": range(5920, 5951 + 1), - "Buhid": range(5952, 5983 + 1), - "Tagbanwa": range(5984, 6015 + 1), - "Khmer": range(6016, 6143 + 1), - "Mongolian": range(6144, 6319 + 1), - "Unified Canadian Aboriginal Syllabics Extended": range(6320, 6399 + 1), - "Limbu": range(6400, 6479 + 1), - "Tai Le": range(6480, 6527 + 1), - "New Tai Lue": range(6528, 6623 + 1), - "Khmer Symbols": range(6624, 6655 + 1), - "Buginese": range(6656, 6687 + 1), - "Tai Tham": range(6688, 6831 + 1), - "Combining Diacritical Marks Extended": range(6832, 6911 + 1), - "Balinese": range(6912, 7039 + 1), - "Sundanese": range(7040, 7103 + 1), - "Batak": range(7104, 7167 + 1), - "Lepcha": range(7168, 7247 + 1), - "Ol Chiki": range(7248, 7295 + 1), - "Cyrillic Extended C": range(7296, 7311 + 1), - "Sundanese Supplement": range(7360, 7375 + 1), - "Vedic Extensions": range(7376, 7423 + 1), - "Phonetic Extensions": range(7424, 7551 + 1), - "Phonetic Extensions Supplement": range(7552, 7615 + 1), - "Combining Diacritical Marks Supplement": range(7616, 7679 + 1), - "Latin Extended Additional": range(7680, 7935 + 1), - "Greek Extended": range(7936, 8191 + 1), - "General Punctuation": range(8192, 8303 + 1), - "Superscripts and Subscripts": range(8304, 8351 + 1), - "Currency Symbols": range(8352, 8399 + 1), - "Combining Diacritical Marks for Symbols": range(8400, 8447 + 1), - "Letterlike Symbols": range(8448, 8527 + 1), - "Number Forms": range(8528, 8591 + 1), - "Arrows": range(8592, 8703 + 1), - "Mathematical Operators": range(8704, 8959 + 1), - "Miscellaneous Technical": range(8960, 9215 + 1), - "Control Pictures": range(9216, 9279 + 1), - "Optical Character Recognition": range(9280, 9311 + 1), - "Enclosed Alphanumerics": range(9312, 9471 + 1), - "Box Drawing": range(9472, 9599 + 1), - "Block Elements": range(9600, 9631 + 1), - "Geometric Shapes": range(9632, 9727 + 1), - "Miscellaneous Symbols": range(9728, 9983 + 1), - "Dingbats": range(9984, 10175 + 1), - "Miscellaneous Mathematical Symbols-A": range(10176, 10223 + 1), - "Supplemental Arrows-A": range(10224, 10239 + 1), - "Braille Patterns": range(10240, 10495 + 1), - "Supplemental Arrows-B": range(10496, 10623 + 1), - "Miscellaneous Mathematical Symbols-B": range(10624, 10751 + 1), - "Supplemental Mathematical Operators": range(10752, 11007 + 1), - "Miscellaneous Symbols and Arrows": range(11008, 11263 + 1), - "Glagolitic": range(11264, 11359 + 1), - "Latin Extended-C": range(11360, 11391 + 1), - "Coptic": range(11392, 11519 + 1), - "Georgian Supplement": range(11520, 11567 + 1), - "Tifinagh": range(11568, 11647 + 1), - "Ethiopic Extended": range(11648, 11743 + 1), - "Cyrillic Extended-A": range(11744, 11775 + 1), - "Supplemental Punctuation": range(11776, 11903 + 1), - "CJK Radicals Supplement": range(11904, 12031 + 1), - "Kangxi Radicals": range(12032, 12255 + 1), - "Ideographic Description Characters": range(12272, 12287 + 1), - "CJK Symbols and Punctuation": range(12288, 12351 + 1), - "Hiragana": range(12352, 12447 + 1), - "Katakana": range(12448, 12543 + 1), - "Bopomofo": range(12544, 12591 + 1), - "Hangul Compatibility Jamo": range(12592, 12687 + 1), - "Kanbun": range(12688, 12703 + 1), - "Bopomofo Extended": range(12704, 12735 + 1), - "CJK Strokes": range(12736, 12783 + 1), - "Katakana Phonetic Extensions": 
range(12784, 12799 + 1), - "Enclosed CJK Letters and Months": range(12800, 13055 + 1), - "CJK Compatibility": range(13056, 13311 + 1), - "CJK Unified Ideographs Extension A": range(13312, 19903 + 1), - "Yijing Hexagram Symbols": range(19904, 19967 + 1), - "CJK Unified Ideographs": range(19968, 40959 + 1), - "Yi Syllables": range(40960, 42127 + 1), - "Yi Radicals": range(42128, 42191 + 1), - "Lisu": range(42192, 42239 + 1), - "Vai": range(42240, 42559 + 1), - "Cyrillic Extended-B": range(42560, 42655 + 1), - "Bamum": range(42656, 42751 + 1), - "Modifier Tone Letters": range(42752, 42783 + 1), - "Latin Extended-D": range(42784, 43007 + 1), - "Syloti Nagri": range(43008, 43055 + 1), - "Common Indic Number Forms": range(43056, 43071 + 1), - "Phags-pa": range(43072, 43135 + 1), - "Saurashtra": range(43136, 43231 + 1), - "Devanagari Extended": range(43232, 43263 + 1), - "Kayah Li": range(43264, 43311 + 1), - "Rejang": range(43312, 43359 + 1), - "Hangul Jamo Extended-A": range(43360, 43391 + 1), - "Javanese": range(43392, 43487 + 1), - "Myanmar Extended-B": range(43488, 43519 + 1), - "Cham": range(43520, 43615 + 1), - "Myanmar Extended-A": range(43616, 43647 + 1), - "Tai Viet": range(43648, 43743 + 1), - "Meetei Mayek Extensions": range(43744, 43775 + 1), - "Ethiopic Extended-A": range(43776, 43823 + 1), - "Latin Extended-E": range(43824, 43887 + 1), - "Cherokee Supplement": range(43888, 43967 + 1), - "Meetei Mayek": range(43968, 44031 + 1), - "Hangul Syllables": range(44032, 55215 + 1), - "Hangul Jamo Extended-B": range(55216, 55295 + 1), - "High Surrogates": range(55296, 56191 + 1), - "High Private Use Surrogates": range(56192, 56319 + 1), - "Low Surrogates": range(56320, 57343 + 1), - "Private Use Area": range(57344, 63743 + 1), - "CJK Compatibility Ideographs": range(63744, 64255 + 1), - "Alphabetic Presentation Forms": range(64256, 64335 + 1), - "Arabic Presentation Forms-A": range(64336, 65023 + 1), - "Variation Selectors": range(65024, 65039 + 1), - "Vertical Forms": range(65040, 65055 + 1), - "Combining Half Marks": range(65056, 65071 + 1), - "CJK Compatibility Forms": range(65072, 65103 + 1), - "Small Form Variants": range(65104, 65135 + 1), - "Arabic Presentation Forms-B": range(65136, 65279 + 1), - "Halfwidth and Fullwidth Forms": range(65280, 65519 + 1), - "Specials": range(65520, 65535 + 1), - "Linear B Syllabary": range(65536, 65663 + 1), - "Linear B Ideograms": range(65664, 65791 + 1), - "Aegean Numbers": range(65792, 65855 + 1), - "Ancient Greek Numbers": range(65856, 65935 + 1), - "Ancient Symbols": range(65936, 65999 + 1), - "Phaistos Disc": range(66000, 66047 + 1), - "Lycian": range(66176, 66207 + 1), - "Carian": range(66208, 66271 + 1), - "Coptic Epact Numbers": range(66272, 66303 + 1), - "Old Italic": range(66304, 66351 + 1), - "Gothic": range(66352, 66383 + 1), - "Old Permic": range(66384, 66431 + 1), - "Ugaritic": range(66432, 66463 + 1), - "Old Persian": range(66464, 66527 + 1), - "Deseret": range(66560, 66639 + 1), - "Shavian": range(66640, 66687 + 1), - "Osmanya": range(66688, 66735 + 1), - "Osage": range(66736, 66815 + 1), - "Elbasan": range(66816, 66863 + 1), - "Caucasian Albanian": range(66864, 66927 + 1), - "Linear A": range(67072, 67455 + 1), - "Cypriot Syllabary": range(67584, 67647 + 1), - "Imperial Aramaic": range(67648, 67679 + 1), - "Palmyrene": range(67680, 67711 + 1), - "Nabataean": range(67712, 67759 + 1), - "Hatran": range(67808, 67839 + 1), - "Phoenician": range(67840, 67871 + 1), - "Lydian": range(67872, 67903 + 1), - "Meroitic Hieroglyphs": range(67968, 
67999 + 1), - "Meroitic Cursive": range(68000, 68095 + 1), - "Kharoshthi": range(68096, 68191 + 1), - "Old South Arabian": range(68192, 68223 + 1), - "Old North Arabian": range(68224, 68255 + 1), - "Manichaean": range(68288, 68351 + 1), - "Avestan": range(68352, 68415 + 1), - "Inscriptional Parthian": range(68416, 68447 + 1), - "Inscriptional Pahlavi": range(68448, 68479 + 1), - "Psalter Pahlavi": range(68480, 68527 + 1), - "Old Turkic": range(68608, 68687 + 1), - "Old Hungarian": range(68736, 68863 + 1), - "Rumi Numeral Symbols": range(69216, 69247 + 1), - "Brahmi": range(69632, 69759 + 1), - "Kaithi": range(69760, 69839 + 1), - "Sora Sompeng": range(69840, 69887 + 1), - "Chakma": range(69888, 69967 + 1), - "Mahajani": range(69968, 70015 + 1), - "Sharada": range(70016, 70111 + 1), - "Sinhala Archaic Numbers": range(70112, 70143 + 1), - "Khojki": range(70144, 70223 + 1), - "Multani": range(70272, 70319 + 1), - "Khudawadi": range(70320, 70399 + 1), - "Grantha": range(70400, 70527 + 1), - "Newa": range(70656, 70783 + 1), - "Tirhuta": range(70784, 70879 + 1), - "Siddham": range(71040, 71167 + 1), - "Modi": range(71168, 71263 + 1), - "Mongolian Supplement": range(71264, 71295 + 1), - "Takri": range(71296, 71375 + 1), - "Ahom": range(71424, 71487 + 1), - "Warang Citi": range(71840, 71935 + 1), - "Zanabazar Square": range(72192, 72271 + 1), - "Soyombo": range(72272, 72367 + 1), - "Pau Cin Hau": range(72384, 72447 + 1), - "Bhaiksuki": range(72704, 72815 + 1), - "Marchen": range(72816, 72895 + 1), - "Masaram Gondi": range(72960, 73055 + 1), - "Cuneiform": range(73728, 74751 + 1), - "Cuneiform Numbers and Punctuation": range(74752, 74879 + 1), - "Early Dynastic Cuneiform": range(74880, 75087 + 1), - "Egyptian Hieroglyphs": range(77824, 78895 + 1), - "Anatolian Hieroglyphs": range(82944, 83583 + 1), - "Bamum Supplement": range(92160, 92735 + 1), - "Mro": range(92736, 92783 + 1), - "Bassa Vah": range(92880, 92927 + 1), - "Pahawh Hmong": range(92928, 93071 + 1), - "Miao": range(93952, 94111 + 1), - "Ideographic Symbols and Punctuation": range(94176, 94207 + 1), - "Tangut": range(94208, 100351 + 1), - "Tangut Components": range(100352, 101119 + 1), - "Kana Supplement": range(110592, 110847 + 1), - "Kana Extended-A": range(110848, 110895 + 1), - "Nushu": range(110960, 111359 + 1), - "Duployan": range(113664, 113823 + 1), - "Shorthand Format Controls": range(113824, 113839 + 1), - "Byzantine Musical Symbols": range(118784, 119039 + 1), - "Musical Symbols": range(119040, 119295 + 1), - "Ancient Greek Musical Notation": range(119296, 119375 + 1), - "Tai Xuan Jing Symbols": range(119552, 119647 + 1), - "Counting Rod Numerals": range(119648, 119679 + 1), - "Mathematical Alphanumeric Symbols": range(119808, 120831 + 1), - "Sutton SignWriting": range(120832, 121519 + 1), - "Glagolitic Supplement": range(122880, 122927 + 1), - "Mende Kikakui": range(124928, 125151 + 1), - "Adlam": range(125184, 125279 + 1), - "Arabic Mathematical Alphabetic Symbols": range(126464, 126719 + 1), - "Mahjong Tiles": range(126976, 127023 + 1), - "Domino Tiles": range(127024, 127135 + 1), - "Playing Cards": range(127136, 127231 + 1), - "Enclosed Alphanumeric Supplement": range(127232, 127487 + 1), - "Enclosed Ideographic Supplement": range(127488, 127743 + 1), - "Miscellaneous Symbols and Pictographs": range(127744, 128511 + 1), - "Emoticons range(Emoji)": range(128512, 128591 + 1), - "Ornamental Dingbats": range(128592, 128639 + 1), - "Transport and Map Symbols": range(128640, 128767 + 1), - "Alchemical Symbols": range(128768, 
128895 + 1), - "Geometric Shapes Extended": range(128896, 129023 + 1), - "Supplemental Arrows-C": range(129024, 129279 + 1), - "Supplemental Symbols and Pictographs": range(129280, 129535 + 1), - "CJK Unified Ideographs Extension B": range(131072, 173791 + 1), - "CJK Unified Ideographs Extension C": range(173824, 177983 + 1), - "CJK Unified Ideographs Extension D": range(177984, 178207 + 1), - "CJK Unified Ideographs Extension E": range(178208, 183983 + 1), - "CJK Unified Ideographs Extension F": range(183984, 191471 + 1), - "CJK Compatibility Ideographs Supplement": range(194560, 195103 + 1), - "Tags": range(917504, 917631 + 1), - "Variation Selectors Supplement": range(917760, 917999 + 1), + "Control character": range(32), + "Basic Latin": range(32, 128), + "Latin-1 Supplement": range(128, 256), + "Latin Extended-A": range(256, 384), + "Latin Extended-B": range(384, 592), + "IPA Extensions": range(592, 688), + "Spacing Modifier Letters": range(688, 768), + "Combining Diacritical Marks": range(768, 880), + "Greek and Coptic": range(880, 1024), + "Cyrillic": range(1024, 1280), + "Cyrillic Supplement": range(1280, 1328), + "Armenian": range(1328, 1424), + "Hebrew": range(1424, 1536), + "Arabic": range(1536, 1792), + "Syriac": range(1792, 1872), + "Arabic Supplement": range(1872, 1920), + "Thaana": range(1920, 1984), + "NKo": range(1984, 2048), + "Samaritan": range(2048, 2112), + "Mandaic": range(2112, 2144), + "Syriac Supplement": range(2144, 2160), + "Arabic Extended-B": range(2160, 2208), + "Arabic Extended-A": range(2208, 2304), + "Devanagari": range(2304, 2432), + "Bengali": range(2432, 2560), + "Gurmukhi": range(2560, 2688), + "Gujarati": range(2688, 2816), + "Oriya": range(2816, 2944), + "Tamil": range(2944, 3072), + "Telugu": range(3072, 3200), + "Kannada": range(3200, 3328), + "Malayalam": range(3328, 3456), + "Sinhala": range(3456, 3584), + "Thai": range(3584, 3712), + "Lao": range(3712, 3840), + "Tibetan": range(3840, 4096), + "Myanmar": range(4096, 4256), + "Georgian": range(4256, 4352), + "Hangul Jamo": range(4352, 4608), + "Ethiopic": range(4608, 4992), + "Ethiopic Supplement": range(4992, 5024), + "Cherokee": range(5024, 5120), + "Unified Canadian Aboriginal Syllabics": range(5120, 5760), + "Ogham": range(5760, 5792), + "Runic": range(5792, 5888), + "Tagalog": range(5888, 5920), + "Hanunoo": range(5920, 5952), + "Buhid": range(5952, 5984), + "Tagbanwa": range(5984, 6016), + "Khmer": range(6016, 6144), + "Mongolian": range(6144, 6320), + "Unified Canadian Aboriginal Syllabics Extended": range(6320, 6400), + "Limbu": range(6400, 6480), + "Tai Le": range(6480, 6528), + "New Tai Lue": range(6528, 6624), + "Khmer Symbols": range(6624, 6656), + "Buginese": range(6656, 6688), + "Tai Tham": range(6688, 6832), + "Combining Diacritical Marks Extended": range(6832, 6912), + "Balinese": range(6912, 7040), + "Sundanese": range(7040, 7104), + "Batak": range(7104, 7168), + "Lepcha": range(7168, 7248), + "Ol Chiki": range(7248, 7296), + "Cyrillic Extended-C": range(7296, 7312), + "Georgian Extended": range(7312, 7360), + "Sundanese Supplement": range(7360, 7376), + "Vedic Extensions": range(7376, 7424), + "Phonetic Extensions": range(7424, 7552), + "Phonetic Extensions Supplement": range(7552, 7616), + "Combining Diacritical Marks Supplement": range(7616, 7680), + "Latin Extended Additional": range(7680, 7936), + "Greek Extended": range(7936, 8192), + "General Punctuation": range(8192, 8304), + "Superscripts and Subscripts": range(8304, 8352), + "Currency Symbols": range(8352, 8400), + 
"Combining Diacritical Marks for Symbols": range(8400, 8448), + "Letterlike Symbols": range(8448, 8528), + "Number Forms": range(8528, 8592), + "Arrows": range(8592, 8704), + "Mathematical Operators": range(8704, 8960), + "Miscellaneous Technical": range(8960, 9216), + "Control Pictures": range(9216, 9280), + "Optical Character Recognition": range(9280, 9312), + "Enclosed Alphanumerics": range(9312, 9472), + "Box Drawing": range(9472, 9600), + "Block Elements": range(9600, 9632), + "Geometric Shapes": range(9632, 9728), + "Miscellaneous Symbols": range(9728, 9984), + "Dingbats": range(9984, 10176), + "Miscellaneous Mathematical Symbols-A": range(10176, 10224), + "Supplemental Arrows-A": range(10224, 10240), + "Braille Patterns": range(10240, 10496), + "Supplemental Arrows-B": range(10496, 10624), + "Miscellaneous Mathematical Symbols-B": range(10624, 10752), + "Supplemental Mathematical Operators": range(10752, 11008), + "Miscellaneous Symbols and Arrows": range(11008, 11264), + "Glagolitic": range(11264, 11360), + "Latin Extended-C": range(11360, 11392), + "Coptic": range(11392, 11520), + "Georgian Supplement": range(11520, 11568), + "Tifinagh": range(11568, 11648), + "Ethiopic Extended": range(11648, 11744), + "Cyrillic Extended-A": range(11744, 11776), + "Supplemental Punctuation": range(11776, 11904), + "CJK Radicals Supplement": range(11904, 12032), + "Kangxi Radicals": range(12032, 12256), + "Ideographic Description Characters": range(12272, 12288), + "CJK Symbols and Punctuation": range(12288, 12352), + "Hiragana": range(12352, 12448), + "Katakana": range(12448, 12544), + "Bopomofo": range(12544, 12592), + "Hangul Compatibility Jamo": range(12592, 12688), + "Kanbun": range(12688, 12704), + "Bopomofo Extended": range(12704, 12736), + "CJK Strokes": range(12736, 12784), + "Katakana Phonetic Extensions": range(12784, 12800), + "Enclosed CJK Letters and Months": range(12800, 13056), + "CJK Compatibility": range(13056, 13312), + "CJK Unified Ideographs Extension A": range(13312, 19904), + "Yijing Hexagram Symbols": range(19904, 19968), + "CJK Unified Ideographs": range(19968, 40960), + "Yi Syllables": range(40960, 42128), + "Yi Radicals": range(42128, 42192), + "Lisu": range(42192, 42240), + "Vai": range(42240, 42560), + "Cyrillic Extended-B": range(42560, 42656), + "Bamum": range(42656, 42752), + "Modifier Tone Letters": range(42752, 42784), + "Latin Extended-D": range(42784, 43008), + "Syloti Nagri": range(43008, 43056), + "Common Indic Number Forms": range(43056, 43072), + "Phags-pa": range(43072, 43136), + "Saurashtra": range(43136, 43232), + "Devanagari Extended": range(43232, 43264), + "Kayah Li": range(43264, 43312), + "Rejang": range(43312, 43360), + "Hangul Jamo Extended-A": range(43360, 43392), + "Javanese": range(43392, 43488), + "Myanmar Extended-B": range(43488, 43520), + "Cham": range(43520, 43616), + "Myanmar Extended-A": range(43616, 43648), + "Tai Viet": range(43648, 43744), + "Meetei Mayek Extensions": range(43744, 43776), + "Ethiopic Extended-A": range(43776, 43824), + "Latin Extended-E": range(43824, 43888), + "Cherokee Supplement": range(43888, 43968), + "Meetei Mayek": range(43968, 44032), + "Hangul Syllables": range(44032, 55216), + "Hangul Jamo Extended-B": range(55216, 55296), + "High Surrogates": range(55296, 56192), + "High Private Use Surrogates": range(56192, 56320), + "Low Surrogates": range(56320, 57344), + "Private Use Area": range(57344, 63744), + "CJK Compatibility Ideographs": range(63744, 64256), + "Alphabetic Presentation Forms": range(64256, 64336), 
+ "Arabic Presentation Forms-A": range(64336, 65024), + "Variation Selectors": range(65024, 65040), + "Vertical Forms": range(65040, 65056), + "Combining Half Marks": range(65056, 65072), + "CJK Compatibility Forms": range(65072, 65104), + "Small Form Variants": range(65104, 65136), + "Arabic Presentation Forms-B": range(65136, 65280), + "Halfwidth and Fullwidth Forms": range(65280, 65520), + "Specials": range(65520, 65536), + "Linear B Syllabary": range(65536, 65664), + "Linear B Ideograms": range(65664, 65792), + "Aegean Numbers": range(65792, 65856), + "Ancient Greek Numbers": range(65856, 65936), + "Ancient Symbols": range(65936, 66000), + "Phaistos Disc": range(66000, 66048), + "Lycian": range(66176, 66208), + "Carian": range(66208, 66272), + "Coptic Epact Numbers": range(66272, 66304), + "Old Italic": range(66304, 66352), + "Gothic": range(66352, 66384), + "Old Permic": range(66384, 66432), + "Ugaritic": range(66432, 66464), + "Old Persian": range(66464, 66528), + "Deseret": range(66560, 66640), + "Shavian": range(66640, 66688), + "Osmanya": range(66688, 66736), + "Osage": range(66736, 66816), + "Elbasan": range(66816, 66864), + "Caucasian Albanian": range(66864, 66928), + "Vithkuqi": range(66928, 67008), + "Linear A": range(67072, 67456), + "Latin Extended-F": range(67456, 67520), + "Cypriot Syllabary": range(67584, 67648), + "Imperial Aramaic": range(67648, 67680), + "Palmyrene": range(67680, 67712), + "Nabataean": range(67712, 67760), + "Hatran": range(67808, 67840), + "Phoenician": range(67840, 67872), + "Lydian": range(67872, 67904), + "Meroitic Hieroglyphs": range(67968, 68000), + "Meroitic Cursive": range(68000, 68096), + "Kharoshthi": range(68096, 68192), + "Old South Arabian": range(68192, 68224), + "Old North Arabian": range(68224, 68256), + "Manichaean": range(68288, 68352), + "Avestan": range(68352, 68416), + "Inscriptional Parthian": range(68416, 68448), + "Inscriptional Pahlavi": range(68448, 68480), + "Psalter Pahlavi": range(68480, 68528), + "Old Turkic": range(68608, 68688), + "Old Hungarian": range(68736, 68864), + "Hanifi Rohingya": range(68864, 68928), + "Rumi Numeral Symbols": range(69216, 69248), + "Yezidi": range(69248, 69312), + "Arabic Extended-C": range(69312, 69376), + "Old Sogdian": range(69376, 69424), + "Sogdian": range(69424, 69488), + "Old Uyghur": range(69488, 69552), + "Chorasmian": range(69552, 69600), + "Elymaic": range(69600, 69632), + "Brahmi": range(69632, 69760), + "Kaithi": range(69760, 69840), + "Sora Sompeng": range(69840, 69888), + "Chakma": range(69888, 69968), + "Mahajani": range(69968, 70016), + "Sharada": range(70016, 70112), + "Sinhala Archaic Numbers": range(70112, 70144), + "Khojki": range(70144, 70224), + "Multani": range(70272, 70320), + "Khudawadi": range(70320, 70400), + "Grantha": range(70400, 70528), + "Newa": range(70656, 70784), + "Tirhuta": range(70784, 70880), + "Siddham": range(71040, 71168), + "Modi": range(71168, 71264), + "Mongolian Supplement": range(71264, 71296), + "Takri": range(71296, 71376), + "Ahom": range(71424, 71504), + "Dogra": range(71680, 71760), + "Warang Citi": range(71840, 71936), + "Dives Akuru": range(71936, 72032), + "Nandinagari": range(72096, 72192), + "Zanabazar Square": range(72192, 72272), + "Soyombo": range(72272, 72368), + "Unified Canadian Aboriginal Syllabics Extended-A": range(72368, 72384), + "Pau Cin Hau": range(72384, 72448), + "Devanagari Extended-A": range(72448, 72544), + "Bhaiksuki": range(72704, 72816), + "Marchen": range(72816, 72896), + "Masaram Gondi": range(72960, 73056), + 
"Gunjala Gondi": range(73056, 73136), + "Makasar": range(73440, 73472), + "Kawi": range(73472, 73568), + "Lisu Supplement": range(73648, 73664), + "Tamil Supplement": range(73664, 73728), + "Cuneiform": range(73728, 74752), + "Cuneiform Numbers and Punctuation": range(74752, 74880), + "Early Dynastic Cuneiform": range(74880, 75088), + "Cypro-Minoan": range(77712, 77824), + "Egyptian Hieroglyphs": range(77824, 78896), + "Egyptian Hieroglyph Format Controls": range(78896, 78944), + "Anatolian Hieroglyphs": range(82944, 83584), + "Bamum Supplement": range(92160, 92736), + "Mro": range(92736, 92784), + "Tangsa": range(92784, 92880), + "Bassa Vah": range(92880, 92928), + "Pahawh Hmong": range(92928, 93072), + "Medefaidrin": range(93760, 93856), + "Miao": range(93952, 94112), + "Ideographic Symbols and Punctuation": range(94176, 94208), + "Tangut": range(94208, 100352), + "Tangut Components": range(100352, 101120), + "Khitan Small Script": range(101120, 101632), + "Tangut Supplement": range(101632, 101760), + "Kana Extended-B": range(110576, 110592), + "Kana Supplement": range(110592, 110848), + "Kana Extended-A": range(110848, 110896), + "Small Kana Extension": range(110896, 110960), + "Nushu": range(110960, 111360), + "Duployan": range(113664, 113824), + "Shorthand Format Controls": range(113824, 113840), + "Znamenny Musical Notation": range(118528, 118736), + "Byzantine Musical Symbols": range(118784, 119040), + "Musical Symbols": range(119040, 119296), + "Ancient Greek Musical Notation": range(119296, 119376), + "Kaktovik Numerals": range(119488, 119520), + "Mayan Numerals": range(119520, 119552), + "Tai Xuan Jing Symbols": range(119552, 119648), + "Counting Rod Numerals": range(119648, 119680), + "Mathematical Alphanumeric Symbols": range(119808, 120832), + "Sutton SignWriting": range(120832, 121520), + "Latin Extended-G": range(122624, 122880), + "Glagolitic Supplement": range(122880, 122928), + "Cyrillic Extended-D": range(122928, 123024), + "Nyiakeng Puachue Hmong": range(123136, 123216), + "Toto": range(123536, 123584), + "Wancho": range(123584, 123648), + "Nag Mundari": range(124112, 124160), + "Ethiopic Extended-B": range(124896, 124928), + "Mende Kikakui": range(124928, 125152), + "Adlam": range(125184, 125280), + "Indic Siyaq Numbers": range(126064, 126144), + "Ottoman Siyaq Numbers": range(126208, 126288), + "Arabic Mathematical Alphabetic Symbols": range(126464, 126720), + "Mahjong Tiles": range(126976, 127024), + "Domino Tiles": range(127024, 127136), + "Playing Cards": range(127136, 127232), + "Enclosed Alphanumeric Supplement": range(127232, 127488), + "Enclosed Ideographic Supplement": range(127488, 127744), + "Miscellaneous Symbols and Pictographs": range(127744, 128512), + "Emoticons range(Emoji)": range(128512, 128592), + "Ornamental Dingbats": range(128592, 128640), + "Transport and Map Symbols": range(128640, 128768), + "Alchemical Symbols": range(128768, 128896), + "Geometric Shapes Extended": range(128896, 129024), + "Supplemental Arrows-C": range(129024, 129280), + "Supplemental Symbols and Pictographs": range(129280, 129536), + "Chess Symbols": range(129536, 129648), + "Symbols and Pictographs Extended-A": range(129648, 129792), + "Symbols for Legacy Computing": range(129792, 130048), + "CJK Unified Ideographs Extension B": range(131072, 173792), + "CJK Unified Ideographs Extension C": range(173824, 177984), + "CJK Unified Ideographs Extension D": range(177984, 178208), + "CJK Unified Ideographs Extension E": range(178208, 183984), + "CJK Unified Ideographs Extension 
F": range(183984, 191472), + "CJK Compatibility Ideographs Supplement": range(194560, 195104), + "CJK Unified Ideographs Extension G": range(196608, 201552), + "CJK Unified Ideographs Extension H": range(201552, 205744), + "Tags": range(917504, 917632), + "Variation Selectors Supplement": range(917760, 918000), + "Supplementary Private Use Area-A": range(983040, 1048576), + "Supplementary Private Use Area-B": range(1048576, 1114112), } @@ -331,11 +380,23 @@ RE_POSSIBLE_ENCODING_INDICATION = re_compile( IGNORECASE, ) +IANA_NO_ALIASES = [ + "cp720", + "cp737", + "cp856", + "cp874", + "cp875", + "cp1006", + "koi8_r", + "koi8_t", + "koi8_u", +] + IANA_SUPPORTED: List[str] = sorted( filter( lambda x: x.endswith("_codec") is False and x not in {"rot_13", "tactis", "mbcs"}, - list(set(aliases.values())), + list(set(aliases.values())) + IANA_NO_ALIASES, ) ) @@ -489,7 +550,1446 @@ COMMON_SAFE_ASCII_CHARACTERS: Set[str] = { KO_NAMES: Set[str] = {"johab", "cp949", "euc_kr"} ZH_NAMES: Set[str] = {"big5", "cp950", "big5hkscs", "hz"} -LANGUAGE_SUPPORTED_COUNT: int = len(FREQUENCIES) - # Logging LEVEL below DEBUG TRACE: int = 5 + + +# Language label that contain the em dash "—" +# character are to be considered alternative seq to origin +FREQUENCIES: Dict[str, List[str]] = { + "English": [ + "e", + "a", + "t", + "i", + "o", + "n", + "s", + "r", + "h", + "l", + "d", + "c", + "u", + "m", + "f", + "p", + "g", + "w", + "y", + "b", + "v", + "k", + "x", + "j", + "z", + "q", + ], + "English—": [ + "e", + "a", + "t", + "i", + "o", + "n", + "s", + "r", + "h", + "l", + "d", + "c", + "m", + "u", + "f", + "p", + "g", + "w", + "b", + "y", + "v", + "k", + "j", + "x", + "z", + "q", + ], + "German": [ + "e", + "n", + "i", + "r", + "s", + "t", + "a", + "d", + "h", + "u", + "l", + "g", + "o", + "c", + "m", + "b", + "f", + "k", + "w", + "z", + "p", + "v", + "ü", + "ä", + "ö", + "j", + ], + "French": [ + "e", + "a", + "s", + "n", + "i", + "t", + "r", + "l", + "u", + "o", + "d", + "c", + "p", + "m", + "é", + "v", + "g", + "f", + "b", + "h", + "q", + "à", + "x", + "è", + "y", + "j", + ], + "Dutch": [ + "e", + "n", + "a", + "i", + "r", + "t", + "o", + "d", + "s", + "l", + "g", + "h", + "v", + "m", + "u", + "k", + "c", + "p", + "b", + "w", + "j", + "z", + "f", + "y", + "x", + "ë", + ], + "Italian": [ + "e", + "i", + "a", + "o", + "n", + "l", + "t", + "r", + "s", + "c", + "d", + "u", + "p", + "m", + "g", + "v", + "f", + "b", + "z", + "h", + "q", + "è", + "à", + "k", + "y", + "ò", + ], + "Polish": [ + "a", + "i", + "o", + "e", + "n", + "r", + "z", + "w", + "s", + "c", + "t", + "k", + "y", + "d", + "p", + "m", + "u", + "l", + "j", + "ł", + "g", + "b", + "h", + "ą", + "ę", + "ó", + ], + "Spanish": [ + "e", + "a", + "o", + "n", + "s", + "r", + "i", + "l", + "d", + "t", + "c", + "u", + "m", + "p", + "b", + "g", + "v", + "f", + "y", + "ó", + "h", + "q", + "í", + "j", + "z", + "á", + ], + "Russian": [ + "о", + "а", + "е", + "и", + "н", + "с", + "т", + "р", + "в", + "л", + "к", + "м", + "д", + "п", + "у", + "г", + "я", + "ы", + "з", + "б", + "й", + "ь", + "ч", + "х", + "ж", + "ц", + ], + # Jap-Kanji + "Japanese": [ + "人", + "一", + "大", + "亅", + "丁", + "丨", + "竹", + "笑", + "口", + "日", + "今", + "二", + "彳", + "行", + "十", + "土", + "丶", + "寸", + "寺", + "時", + "乙", + "丿", + "乂", + "气", + "気", + "冂", + "巾", + "亠", + "市", + "目", + "儿", + "見", + "八", + "小", + "凵", + "県", + "月", + "彐", + "門", + "間", + "木", + "東", + "山", + "出", + "本", + "中", + "刀", + "分", + "耳", + "又", + "取", + "最", + "言", + "田", + "心", + "思", + "刂", + "前", + "京", + "尹", + "事", + 
"生", + "厶", + "云", + "会", + "未", + "来", + "白", + "冫", + "楽", + "灬", + "馬", + "尸", + "尺", + "駅", + "明", + "耂", + "者", + "了", + "阝", + "都", + "高", + "卜", + "占", + "厂", + "广", + "店", + "子", + "申", + "奄", + "亻", + "俺", + "上", + "方", + "冖", + "学", + "衣", + "艮", + "食", + "自", + ], + # Jap-Katakana + "Japanese—": [ + "ー", + "ン", + "ス", + "・", + "ル", + "ト", + "リ", + "イ", + "ア", + "ラ", + "ッ", + "ク", + "ド", + "シ", + "レ", + "ジ", + "タ", + "フ", + "ロ", + "カ", + "テ", + "マ", + "ィ", + "グ", + "バ", + "ム", + "プ", + "オ", + "コ", + "デ", + "ニ", + "ウ", + "メ", + "サ", + "ビ", + "ナ", + "ブ", + "ャ", + "エ", + "ュ", + "チ", + "キ", + "ズ", + "ダ", + "パ", + "ミ", + "ェ", + "ョ", + "ハ", + "セ", + "ベ", + "ガ", + "モ", + "ツ", + "ネ", + "ボ", + "ソ", + "ノ", + "ァ", + "ヴ", + "ワ", + "ポ", + "ペ", + "ピ", + "ケ", + "ゴ", + "ギ", + "ザ", + "ホ", + "ゲ", + "ォ", + "ヤ", + "ヒ", + "ユ", + "ヨ", + "ヘ", + "ゼ", + "ヌ", + "ゥ", + "ゾ", + "ヶ", + "ヂ", + "ヲ", + "ヅ", + "ヵ", + "ヱ", + "ヰ", + "ヮ", + "ヽ", + "゠", + "ヾ", + "ヷ", + "ヿ", + "ヸ", + "ヹ", + "ヺ", + ], + # Jap-Hiragana + "Japanese——": [ + "の", + "に", + "る", + "た", + "と", + "は", + "し", + "い", + "を", + "で", + "て", + "が", + "な", + "れ", + "か", + "ら", + "さ", + "っ", + "り", + "す", + "あ", + "も", + "こ", + "ま", + "う", + "く", + "よ", + "き", + "ん", + "め", + "お", + "け", + "そ", + "つ", + "だ", + "や", + "え", + "ど", + "わ", + "ち", + "み", + "せ", + "じ", + "ば", + "へ", + "び", + "ず", + "ろ", + "ほ", + "げ", + "む", + "べ", + "ひ", + "ょ", + "ゆ", + "ぶ", + "ご", + "ゃ", + "ね", + "ふ", + "ぐ", + "ぎ", + "ぼ", + "ゅ", + "づ", + "ざ", + "ぞ", + "ぬ", + "ぜ", + "ぱ", + "ぽ", + "ぷ", + "ぴ", + "ぃ", + "ぁ", + "ぇ", + "ぺ", + "ゞ", + "ぢ", + "ぉ", + "ぅ", + "ゐ", + "ゝ", + "ゑ", + "゛", + "゜", + "ゎ", + "ゔ", + "゚", + "ゟ", + "゙", + "ゕ", + "ゖ", + ], + "Portuguese": [ + "a", + "e", + "o", + "s", + "i", + "r", + "d", + "n", + "t", + "m", + "u", + "c", + "l", + "p", + "g", + "v", + "b", + "f", + "h", + "ã", + "q", + "é", + "ç", + "á", + "z", + "í", + ], + "Swedish": [ + "e", + "a", + "n", + "r", + "t", + "s", + "i", + "l", + "d", + "o", + "m", + "k", + "g", + "v", + "h", + "f", + "u", + "p", + "ä", + "c", + "b", + "ö", + "å", + "y", + "j", + "x", + ], + "Chinese": [ + "的", + "一", + "是", + "不", + "了", + "在", + "人", + "有", + "我", + "他", + "这", + "个", + "们", + "中", + "来", + "上", + "大", + "为", + "和", + "国", + "地", + "到", + "以", + "说", + "时", + "要", + "就", + "出", + "会", + "可", + "也", + "你", + "对", + "生", + "能", + "而", + "子", + "那", + "得", + "于", + "着", + "下", + "自", + "之", + "年", + "过", + "发", + "后", + "作", + "里", + "用", + "道", + "行", + "所", + "然", + "家", + "种", + "事", + "成", + "方", + "多", + "经", + "么", + "去", + "法", + "学", + "如", + "都", + "同", + "现", + "当", + "没", + "动", + "面", + "起", + "看", + "定", + "天", + "分", + "还", + "进", + "好", + "小", + "部", + "其", + "些", + "主", + "样", + "理", + "心", + "她", + "本", + "前", + "开", + "但", + "因", + "只", + "从", + "想", + "实", + ], + "Ukrainian": [ + "о", + "а", + "н", + "і", + "и", + "р", + "в", + "т", + "е", + "с", + "к", + "л", + "у", + "д", + "м", + "п", + "з", + "я", + "ь", + "б", + "г", + "й", + "ч", + "х", + "ц", + "ї", + ], + "Norwegian": [ + "e", + "r", + "n", + "t", + "a", + "s", + "i", + "o", + "l", + "d", + "g", + "k", + "m", + "v", + "f", + "p", + "u", + "b", + "h", + "å", + "y", + "j", + "ø", + "c", + "æ", + "w", + ], + "Finnish": [ + "a", + "i", + "n", + "t", + "e", + "s", + "l", + "o", + "u", + "k", + "ä", + "m", + "r", + "v", + "j", + "h", + "p", + "y", + "d", + "ö", + "g", + "c", + "b", + "f", + "w", + "z", + ], + "Vietnamese": [ + "n", + "h", + "t", + "i", + "c", + "g", + "a", + "o", + "u", + "m", + "l", + "r", + "à", + "đ", + "s", + "e", + "v", + 
"p", + "b", + "y", + "ư", + "d", + "á", + "k", + "ộ", + "ế", + ], + "Czech": [ + "o", + "e", + "a", + "n", + "t", + "s", + "i", + "l", + "v", + "r", + "k", + "d", + "u", + "m", + "p", + "í", + "c", + "h", + "z", + "á", + "y", + "j", + "b", + "ě", + "é", + "ř", + ], + "Hungarian": [ + "e", + "a", + "t", + "l", + "s", + "n", + "k", + "r", + "i", + "o", + "z", + "á", + "é", + "g", + "m", + "b", + "y", + "v", + "d", + "h", + "u", + "p", + "j", + "ö", + "f", + "c", + ], + "Korean": [ + "이", + "다", + "에", + "의", + "는", + "로", + "하", + "을", + "가", + "고", + "지", + "서", + "한", + "은", + "기", + "으", + "년", + "대", + "사", + "시", + "를", + "리", + "도", + "인", + "스", + "일", + ], + "Indonesian": [ + "a", + "n", + "e", + "i", + "r", + "t", + "u", + "s", + "d", + "k", + "m", + "l", + "g", + "p", + "b", + "o", + "h", + "y", + "j", + "c", + "w", + "f", + "v", + "z", + "x", + "q", + ], + "Turkish": [ + "a", + "e", + "i", + "n", + "r", + "l", + "ı", + "k", + "d", + "t", + "s", + "m", + "y", + "u", + "o", + "b", + "ü", + "ş", + "v", + "g", + "z", + "h", + "c", + "p", + "ç", + "ğ", + ], + "Romanian": [ + "e", + "i", + "a", + "r", + "n", + "t", + "u", + "l", + "o", + "c", + "s", + "d", + "p", + "m", + "ă", + "f", + "v", + "î", + "g", + "b", + "ș", + "ț", + "z", + "h", + "â", + "j", + ], + "Farsi": [ + "ا", + "ی", + "ر", + "د", + "ن", + "ه", + "و", + "م", + "ت", + "ب", + "س", + "ل", + "ک", + "ش", + "ز", + "ف", + "گ", + "ع", + "خ", + "ق", + "ج", + "آ", + "پ", + "ح", + "ط", + "ص", + ], + "Arabic": [ + "ا", + "ل", + "ي", + "م", + "و", + "ن", + "ر", + "ت", + "ب", + "ة", + "ع", + "د", + "س", + "ف", + "ه", + "ك", + "ق", + "أ", + "ح", + "ج", + "ش", + "ط", + "ص", + "ى", + "خ", + "إ", + ], + "Danish": [ + "e", + "r", + "n", + "t", + "a", + "i", + "s", + "d", + "l", + "o", + "g", + "m", + "k", + "f", + "v", + "u", + "b", + "h", + "p", + "å", + "y", + "ø", + "æ", + "c", + "j", + "w", + ], + "Serbian": [ + "а", + "и", + "о", + "е", + "н", + "р", + "с", + "у", + "т", + "к", + "ј", + "в", + "д", + "м", + "п", + "л", + "г", + "з", + "б", + "a", + "i", + "e", + "o", + "n", + "ц", + "ш", + ], + "Lithuanian": [ + "i", + "a", + "s", + "o", + "r", + "e", + "t", + "n", + "u", + "k", + "m", + "l", + "p", + "v", + "d", + "j", + "g", + "ė", + "b", + "y", + "ų", + "š", + "ž", + "c", + "ą", + "į", + ], + "Slovene": [ + "e", + "a", + "i", + "o", + "n", + "r", + "s", + "l", + "t", + "j", + "v", + "k", + "d", + "p", + "m", + "u", + "z", + "b", + "g", + "h", + "č", + "c", + "š", + "ž", + "f", + "y", + ], + "Slovak": [ + "o", + "a", + "e", + "n", + "i", + "r", + "v", + "t", + "s", + "l", + "k", + "d", + "m", + "p", + "u", + "c", + "h", + "j", + "b", + "z", + "á", + "y", + "ý", + "í", + "č", + "é", + ], + "Hebrew": [ + "י", + "ו", + "ה", + "ל", + "ר", + "ב", + "ת", + "מ", + "א", + "ש", + "נ", + "ע", + "ם", + "ד", + "ק", + "ח", + "פ", + "ס", + "כ", + "ג", + "ט", + "צ", + "ן", + "ז", + "ך", + ], + "Bulgarian": [ + "а", + "и", + "о", + "е", + "н", + "т", + "р", + "с", + "в", + "л", + "к", + "д", + "п", + "м", + "з", + "г", + "я", + "ъ", + "у", + "б", + "ч", + "ц", + "й", + "ж", + "щ", + "х", + ], + "Croatian": [ + "a", + "i", + "o", + "e", + "n", + "r", + "j", + "s", + "t", + "u", + "k", + "l", + "v", + "d", + "m", + "p", + "g", + "z", + "b", + "c", + "č", + "h", + "š", + "ž", + "ć", + "f", + ], + "Hindi": [ + "क", + "र", + "स", + "न", + "त", + "म", + "ह", + "प", + "य", + "ल", + "व", + "ज", + "द", + "ग", + "ब", + "श", + "ट", + "अ", + "ए", + "थ", + "भ", + "ड", + "च", + "ध", + "ष", + "इ", + ], + "Estonian": [ + "a", + "i", + "e", + "s", + "t", + "l", + 
"u", + "n", + "o", + "k", + "r", + "d", + "m", + "v", + "g", + "p", + "j", + "h", + "ä", + "b", + "õ", + "ü", + "f", + "c", + "ö", + "y", + ], + "Thai": [ + "า", + "น", + "ร", + "อ", + "ก", + "เ", + "ง", + "ม", + "ย", + "ล", + "ว", + "ด", + "ท", + "ส", + "ต", + "ะ", + "ป", + "บ", + "ค", + "ห", + "แ", + "จ", + "พ", + "ช", + "ข", + "ใ", + ], + "Greek": [ + "α", + "τ", + "ο", + "ι", + "ε", + "ν", + "ρ", + "σ", + "κ", + "η", + "π", + "ς", + "υ", + "μ", + "λ", + "ί", + "ό", + "ά", + "γ", + "έ", + "δ", + "ή", + "ω", + "χ", + "θ", + "ύ", + ], + "Tamil": [ + "க", + "த", + "ப", + "ட", + "ர", + "ம", + "ல", + "ன", + "வ", + "ற", + "ய", + "ள", + "ச", + "ந", + "இ", + "ண", + "அ", + "ஆ", + "ழ", + "ங", + "எ", + "உ", + "ஒ", + "ஸ", + ], + "Kazakh": [ + "а", + "ы", + "е", + "н", + "т", + "р", + "л", + "і", + "д", + "с", + "м", + "қ", + "к", + "о", + "б", + "и", + "у", + "ғ", + "ж", + "ң", + "з", + "ш", + "й", + "п", + "г", + "ө", + ], +} + +LANGUAGE_SUPPORTED_COUNT: int = len(FREQUENCIES) diff --git a/lib/charset_normalizer/md.py b/lib/charset_normalizer/md.py index 13aa062e..77897aae 100644 --- a/lib/charset_normalizer/md.py +++ b/lib/charset_normalizer/md.py @@ -9,7 +9,8 @@ from .constant import ( ) from .utils import ( is_accentuated, - is_ascii, + is_arabic, + is_arabic_isolated_form, is_case_variable, is_cjk, is_emoticon, @@ -128,8 +129,9 @@ class TooManyAccentuatedPlugin(MessDetectorPlugin): @property def ratio(self) -> float: - if self._character_count == 0 or self._character_count < 8: + if self._character_count < 8: return 0.0 + ratio_of_accentuation: float = self._accentuated_count / self._character_count return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0 @@ -234,16 +236,13 @@ class SuspiciousRange(MessDetectorPlugin): @property def ratio(self) -> float: - if self._character_count == 0: + if self._character_count <= 24: return 0.0 ratio_of_suspicious_range_usage: float = ( self._suspicious_successive_range_count * 2 ) / self._character_count - if ratio_of_suspicious_range_usage < 0.1: - return 0.0 - return ratio_of_suspicious_range_usage @@ -296,7 +295,11 @@ class SuperWeirdWordPlugin(MessDetectorPlugin): self._is_current_word_bad = True # Word/Buffer ending with an upper case accentuated letter are so rare, # that we will consider them all as suspicious. Same weight as foreign_long suspicious. 
- if is_accentuated(self._buffer[-1]) and self._buffer[-1].isupper(): + if ( + is_accentuated(self._buffer[-1]) + and self._buffer[-1].isupper() + and all(_.isupper() for _ in self._buffer) is False + ): self._foreign_long_count += 1 self._is_current_word_bad = True if buffer_length >= 24 and self._foreign_long_watch: @@ -419,7 +422,7 @@ class ArchaicUpperLowerPlugin(MessDetectorPlugin): return - if self._current_ascii_only is True and is_ascii(character) is False: + if self._current_ascii_only is True and character.isascii() is False: self._current_ascii_only = False if self._last_alpha_seen is not None: @@ -455,6 +458,34 @@ class ArchaicUpperLowerPlugin(MessDetectorPlugin): return self._successive_upper_lower_count_final / self._character_count +class ArabicIsolatedFormPlugin(MessDetectorPlugin): + def __init__(self) -> None: + self._character_count: int = 0 + self._isolated_form_count: int = 0 + + def reset(self) -> None: # pragma: no cover + self._character_count = 0 + self._isolated_form_count = 0 + + def eligible(self, character: str) -> bool: + return is_arabic(character) + + def feed(self, character: str) -> None: + self._character_count += 1 + + if is_arabic_isolated_form(character): + self._isolated_form_count += 1 + + @property + def ratio(self) -> float: + if self._character_count < 8: + return 0.0 + + isolated_form_usage: float = self._isolated_form_count / self._character_count + + return isolated_form_usage + + @lru_cache(maxsize=1024) def is_suspiciously_successive_range( unicode_range_a: Optional[str], unicode_range_b: Optional[str] @@ -522,6 +553,8 @@ def is_suspiciously_successive_range( return False if "Forms" in unicode_range_a or "Forms" in unicode_range_b: return False + if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin": + return False return True diff --git a/lib/charset_normalizer/models.py b/lib/charset_normalizer/models.py index 7f8ca389..a760b9c5 100644 --- a/lib/charset_normalizer/models.py +++ b/lib/charset_normalizer/models.py @@ -54,16 +54,19 @@ class CharsetMatch: # Below 1% difference --> Use Coherence if chaos_difference < 0.01 and coherence_difference > 0.02: - # When having a tough decision, use the result that decoded as many multi-byte as possible. - if chaos_difference == 0.0 and self.coherence == other.coherence: - return self.multi_byte_usage > other.multi_byte_usage return self.coherence > other.coherence + elif chaos_difference < 0.01 and coherence_difference <= 0.02: + # When having a difficult decision, use the result that decoded as many multi-byte as possible. + # preserve RAM usage! 
+ if len(self._payload) >= TOO_BIG_SEQUENCE: + return self.chaos < other.chaos + return self.multi_byte_usage > other.multi_byte_usage return self.chaos < other.chaos @property def multi_byte_usage(self) -> float: - return 1.0 - len(str(self)) / len(self.raw) + return 1.0 - (len(str(self)) / len(self.raw)) def __str__(self) -> str: # Lazy Str Loading diff --git a/lib/charset_normalizer/utils.py b/lib/charset_normalizer/utils.py index bf2767a0..e5cbbf4c 100644 --- a/lib/charset_normalizer/utils.py +++ b/lib/charset_normalizer/utils.py @@ -32,6 +32,8 @@ def is_accentuated(character: str) -> bool: or "WITH DIAERESIS" in description or "WITH CIRCUMFLEX" in description or "WITH TILDE" in description + or "WITH MACRON" in description + or "WITH RING ABOVE" in description ) @@ -69,15 +71,6 @@ def is_latin(character: str) -> bool: return "LATIN" in description -@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION) -def is_ascii(character: str) -> bool: - try: - character.encode("ascii") - except UnicodeEncodeError: - return False - return True - - @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION) def is_punctuation(character: str) -> bool: character_category: str = unicodedata.category(character) @@ -105,7 +98,7 @@ def is_symbol(character: str) -> bool: if character_range is None: return False - return "Forms" in character_range + return "Forms" in character_range and character_category != "Lo" @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION) @@ -115,7 +108,7 @@ def is_emoticon(character: str) -> bool: if character_range is None: return False - return "Emoticons" in character_range + return "Emoticons" in character_range or "Pictographs" in character_range @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION) @@ -133,12 +126,6 @@ def is_case_variable(character: str) -> bool: return character.islower() != character.isupper() -def is_private_use_only(character: str) -> bool: - character_category: str = unicodedata.category(character) - - return character_category == "Co" - - @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION) def is_cjk(character: str) -> bool: try: @@ -189,6 +176,26 @@ def is_thai(character: str) -> bool: return "THAI" in character_name +@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION) +def is_arabic(character: str) -> bool: + try: + character_name = unicodedata.name(character) + except ValueError: + return False + + return "ARABIC" in character_name + + +@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION) +def is_arabic_isolated_form(character: str) -> bool: + try: + character_name = unicodedata.name(character) + except ValueError: + return False + + return "ARABIC" in character_name and "ISOLATED FORM" in character_name + + @lru_cache(maxsize=len(UNICODE_RANGES_COMBINED)) def is_unicode_range_secondary(range_name: str) -> bool: return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD) @@ -205,7 +212,7 @@ def is_unprintable(character: str) -> bool: ) -def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional[str]: +def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> Optional[str]: """ Extract using ASCII-only decoder any specified encoding in the first n-bytes. 
""" diff --git a/lib/charset_normalizer/version.py b/lib/charset_normalizer/version.py index 5eed49a4..5a4da4ff 100644 --- a/lib/charset_normalizer/version.py +++ b/lib/charset_normalizer/version.py @@ -2,5 +2,5 @@ Expose version """ -__version__ = "3.2.0" +__version__ = "3.3.2" VERSION = __version__.split(".") diff --git a/lib/idna/codec.py b/lib/idna/codec.py index 1ca9ba62..c855a4de 100644 --- a/lib/idna/codec.py +++ b/lib/idna/codec.py @@ -1,7 +1,7 @@ from .core import encode, decode, alabel, ulabel, IDNAError import codecs import re -from typing import Tuple, Optional +from typing import Any, Tuple, Optional _unicode_dots_re = re.compile('[\u002e\u3002\uff0e\uff61]') @@ -26,24 +26,24 @@ class Codec(codecs.Codec): return decode(data), len(data) class IncrementalEncoder(codecs.BufferedIncrementalEncoder): - def _buffer_encode(self, data: str, errors: str, final: bool) -> Tuple[str, int]: # type: ignore + def _buffer_encode(self, data: str, errors: str, final: bool) -> Tuple[bytes, int]: if errors != 'strict': raise IDNAError('Unsupported error handling \"{}\"'.format(errors)) if not data: - return "", 0 + return b'', 0 labels = _unicode_dots_re.split(data) - trailing_dot = '' + trailing_dot = b'' if labels: if not labels[-1]: - trailing_dot = '.' + trailing_dot = b'.' del labels[-1] elif not final: # Keep potentially unfinished label until the next call del labels[-1] if labels: - trailing_dot = '.' + trailing_dot = b'.' result = [] size = 0 @@ -54,18 +54,21 @@ class IncrementalEncoder(codecs.BufferedIncrementalEncoder): size += len(label) # Join with U+002E - result_str = '.'.join(result) + trailing_dot # type: ignore + result_bytes = b'.'.join(result) + trailing_dot size += len(trailing_dot) - return result_str, size + return result_bytes, size class IncrementalDecoder(codecs.BufferedIncrementalDecoder): - def _buffer_decode(self, data: str, errors: str, final: bool) -> Tuple[str, int]: # type: ignore + def _buffer_decode(self, data: Any, errors: str, final: bool) -> Tuple[str, int]: if errors != 'strict': raise IDNAError('Unsupported error handling \"{}\"'.format(errors)) if not data: return ('', 0) + if not isinstance(data, str): + data = str(data, 'ascii') + labels = _unicode_dots_re.split(data) trailing_dot = '' if labels: @@ -99,14 +102,17 @@ class StreamReader(Codec, codecs.StreamReader): pass -def getregentry() -> codecs.CodecInfo: - # Compatibility as a search_function for codecs.register() +def search_function(name: str) -> Optional[codecs.CodecInfo]: + if name != 'idna2008': + return None return codecs.CodecInfo( - name='idna', - encode=Codec().encode, # type: ignore - decode=Codec().decode, # type: ignore + name=name, + encode=Codec().encode, + decode=Codec().decode, incrementalencoder=IncrementalEncoder, incrementaldecoder=IncrementalDecoder, streamwriter=StreamWriter, streamreader=StreamReader, ) + +codecs.register(search_function) diff --git a/lib/idna/core.py b/lib/idna/core.py index 4f300371..aaf7d658 100644 --- a/lib/idna/core.py +++ b/lib/idna/core.py @@ -318,7 +318,7 @@ def uts46_remap(domain: str, std3_rules: bool = True, transitional: bool = False status = uts46row[1] replacement = None # type: Optional[str] if len(uts46row) == 3: - replacement = uts46row[2] # type: ignore + replacement = uts46row[2] if (status == 'V' or (status == 'D' and not transitional) or (status == '3' and not std3_rules and replacement is None)): @@ -338,9 +338,9 @@ def uts46_remap(domain: str, std3_rules: bool = True, transitional: bool = False def encode(s: Union[str, bytes, bytearray], 
strict: bool = False, uts46: bool = False, std3_rules: bool = False, transitional: bool = False) -> bytes: - if isinstance(s, (bytes, bytearray)): + if not isinstance(s, str): try: - s = s.decode('ascii') + s = str(s, 'ascii') except UnicodeDecodeError: raise IDNAError('should pass a unicode string to the function rather than a byte string.') if uts46: @@ -372,8 +372,8 @@ def encode(s: Union[str, bytes, bytearray], strict: bool = False, uts46: bool = def decode(s: Union[str, bytes, bytearray], strict: bool = False, uts46: bool = False, std3_rules: bool = False) -> str: try: - if isinstance(s, (bytes, bytearray)): - s = s.decode('ascii') + if not isinstance(s, str): + s = str(s, 'ascii') except UnicodeDecodeError: raise IDNAError('Invalid ASCII in A-label') if uts46: diff --git a/lib/idna/idnadata.py b/lib/idna/idnadata.py index 67db4625..5cd05d90 100644 --- a/lib/idna/idnadata.py +++ b/lib/idna/idnadata.py @@ -1,6 +1,6 @@ # This file is automatically generated by tools/idna-data -__version__ = '15.0.0' +__version__ = '15.1.0' scripts = { 'Greek': ( 0x37000000374, @@ -59,6 +59,7 @@ scripts = { 0x2b7400002b81e, 0x2b8200002cea2, 0x2ceb00002ebe1, + 0x2ebf00002ee5e, 0x2f8000002fa1e, 0x300000003134b, 0x31350000323b0, @@ -1834,7 +1835,6 @@ codepoint_classes = { 0xa7d50000a7d6, 0xa7d70000a7d8, 0xa7d90000a7da, - 0xa7f20000a7f5, 0xa7f60000a7f8, 0xa7fa0000a828, 0xa82c0000a82d, @@ -1907,9 +1907,7 @@ codepoint_classes = { 0x1060000010737, 0x1074000010756, 0x1076000010768, - 0x1078000010786, - 0x10787000107b1, - 0x107b2000107bb, + 0x1078000010781, 0x1080000010806, 0x1080800010809, 0x1080a00010836, @@ -2134,6 +2132,7 @@ codepoint_classes = { 0x2b7400002b81e, 0x2b8200002cea2, 0x2ceb00002ebe1, + 0x2ebf00002ee5e, 0x300000003134b, 0x31350000323b0, ), diff --git a/lib/idna/package_data.py b/lib/idna/package_data.py index 8501893b..c5b7220c 100644 --- a/lib/idna/package_data.py +++ b/lib/idna/package_data.py @@ -1,2 +1,2 @@ -__version__ = '3.4' +__version__ = '3.6' diff --git a/lib/idna/uts46data.py b/lib/idna/uts46data.py index 186796c1..6a1eddbf 100644 --- a/lib/idna/uts46data.py +++ b/lib/idna/uts46data.py @@ -7,7 +7,7 @@ from typing import List, Tuple, Union """IDNA Mapping Table from UTS46.""" -__version__ = '15.0.0' +__version__ = '15.1.0' def _seg_0() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: return [ (0x0, '3'), @@ -1899,7 +1899,7 @@ def _seg_18() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0x1E9A, 'M', 'aʾ'), (0x1E9B, 'M', 'ṡ'), (0x1E9C, 'V'), - (0x1E9E, 'M', 'ss'), + (0x1E9E, 'M', 'ß'), (0x1E9F, 'V'), (0x1EA0, 'M', 'ạ'), (0x1EA1, 'V'), @@ -2418,10 +2418,6 @@ def _seg_23() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0x222F, 'M', '∮∮'), (0x2230, 'M', '∮∮∮'), (0x2231, 'V'), - (0x2260, '3'), - (0x2261, 'V'), - (0x226E, '3'), - (0x2270, 'V'), (0x2329, 'M', '〈'), (0x232A, 'M', '〉'), (0x232B, 'V'), @@ -2502,14 +2498,14 @@ def _seg_23() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0x24BA, 'M', 'e'), (0x24BB, 'M', 'f'), (0x24BC, 'M', 'g'), - ] - -def _seg_24() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0x24BD, 'M', 'h'), (0x24BE, 'M', 'i'), (0x24BF, 'M', 'j'), (0x24C0, 'M', 'k'), + ] + +def _seg_24() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0x24C1, 'M', 'l'), (0x24C2, 'M', 'm'), (0x24C3, 'M', 'n'), @@ -2606,14 +2602,14 @@ def _seg_24() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0x2C26, 'M', 'ⱖ'), (0x2C27, 'M', 'ⱗ'), (0x2C28, 'M', 'ⱘ'), - ] - -def _seg_25() -> List[Union[Tuple[int, str], Tuple[int, 
str, str]]]: - return [ (0x2C29, 'M', 'ⱙ'), (0x2C2A, 'M', 'ⱚ'), (0x2C2B, 'M', 'ⱛ'), (0x2C2C, 'M', 'ⱜ'), + ] + +def _seg_25() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0x2C2D, 'M', 'ⱝ'), (0x2C2E, 'M', 'ⱞ'), (0x2C2F, 'M', 'ⱟ'), @@ -2710,14 +2706,14 @@ def _seg_25() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0x2CC0, 'M', 'ⳁ'), (0x2CC1, 'V'), (0x2CC2, 'M', 'ⳃ'), - ] - -def _seg_26() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0x2CC3, 'V'), (0x2CC4, 'M', 'ⳅ'), (0x2CC5, 'V'), (0x2CC6, 'M', 'ⳇ'), + ] + +def _seg_26() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0x2CC7, 'V'), (0x2CC8, 'M', 'ⳉ'), (0x2CC9, 'V'), @@ -2814,14 +2810,14 @@ def _seg_26() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0x2F13, 'M', '勹'), (0x2F14, 'M', '匕'), (0x2F15, 'M', '匚'), - ] - -def _seg_27() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0x2F16, 'M', '匸'), (0x2F17, 'M', '十'), (0x2F18, 'M', '卜'), (0x2F19, 'M', '卩'), + ] + +def _seg_27() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0x2F1A, 'M', '厂'), (0x2F1B, 'M', '厶'), (0x2F1C, 'M', '又'), @@ -2918,14 +2914,14 @@ def _seg_27() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0x2F77, 'M', '糸'), (0x2F78, 'M', '缶'), (0x2F79, 'M', '网'), - ] - -def _seg_28() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0x2F7A, 'M', '羊'), (0x2F7B, 'M', '羽'), (0x2F7C, 'M', '老'), (0x2F7D, 'M', '而'), + ] + +def _seg_28() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0x2F7E, 'M', '耒'), (0x2F7F, 'M', '耳'), (0x2F80, 'M', '聿'), @@ -3022,14 +3018,14 @@ def _seg_28() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0x3036, 'M', '〒'), (0x3037, 'V'), (0x3038, 'M', '十'), - ] - -def _seg_29() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0x3039, 'M', '卄'), (0x303A, 'M', '卅'), (0x303B, 'V'), (0x3040, 'X'), + ] + +def _seg_29() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0x3041, 'V'), (0x3097, 'X'), (0x3099, 'V'), @@ -3126,14 +3122,14 @@ def _seg_29() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0x3182, 'M', 'ᇱ'), (0x3183, 'M', 'ᇲ'), (0x3184, 'M', 'ᅗ'), - ] - -def _seg_30() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0x3185, 'M', 'ᅘ'), (0x3186, 'M', 'ᅙ'), (0x3187, 'M', 'ᆄ'), (0x3188, 'M', 'ᆅ'), + ] + +def _seg_30() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0x3189, 'M', 'ᆈ'), (0x318A, 'M', 'ᆑ'), (0x318B, 'M', 'ᆒ'), @@ -3230,14 +3226,14 @@ def _seg_30() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0x3244, 'M', '問'), (0x3245, 'M', '幼'), (0x3246, 'M', '文'), - ] - -def _seg_31() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0x3247, 'M', '箏'), (0x3248, 'V'), (0x3250, 'M', 'pte'), (0x3251, 'M', '21'), + ] + +def _seg_31() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0x3252, 'M', '22'), (0x3253, 'M', '23'), (0x3254, 'M', '24'), @@ -3334,14 +3330,14 @@ def _seg_31() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0x32AF, 'M', '協'), (0x32B0, 'M', '夜'), (0x32B1, 'M', '36'), - ] - -def _seg_32() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0x32B2, 'M', '37'), (0x32B3, 'M', '38'), (0x32B4, 'M', '39'), (0x32B5, 'M', '40'), + ] + +def _seg_32() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0x32B6, 'M', '41'), (0x32B7, 'M', '42'), (0x32B8, 'M', '43'), @@ -3438,14 +3434,14 @@ def _seg_32() -> List[Union[Tuple[int, str], Tuple[int, 
str, str]]]: (0x3313, 'M', 'ギルダー'), (0x3314, 'M', 'キロ'), (0x3315, 'M', 'キログラム'), - ] - -def _seg_33() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0x3316, 'M', 'キロメートル'), (0x3317, 'M', 'キロワット'), (0x3318, 'M', 'グラム'), (0x3319, 'M', 'グラムトン'), + ] + +def _seg_33() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0x331A, 'M', 'クルゼイロ'), (0x331B, 'M', 'クローネ'), (0x331C, 'M', 'ケース'), @@ -3542,14 +3538,14 @@ def _seg_33() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0x3377, 'M', 'dm'), (0x3378, 'M', 'dm2'), (0x3379, 'M', 'dm3'), - ] - -def _seg_34() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0x337A, 'M', 'iu'), (0x337B, 'M', '平成'), (0x337C, 'M', '昭和'), (0x337D, 'M', '大正'), + ] + +def _seg_34() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0x337E, 'M', '明治'), (0x337F, 'M', '株式会社'), (0x3380, 'M', 'pa'), @@ -3646,14 +3642,14 @@ def _seg_34() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0x33DB, 'M', 'sr'), (0x33DC, 'M', 'sv'), (0x33DD, 'M', 'wb'), - ] - -def _seg_35() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0x33DE, 'M', 'v∕m'), (0x33DF, 'M', 'a∕m'), (0x33E0, 'M', '1日'), (0x33E1, 'M', '2日'), + ] + +def _seg_35() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0x33E2, 'M', '3日'), (0x33E3, 'M', '4日'), (0x33E4, 'M', '5日'), @@ -3750,14 +3746,14 @@ def _seg_35() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0xA68B, 'V'), (0xA68C, 'M', 'ꚍ'), (0xA68D, 'V'), - ] - -def _seg_36() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0xA68E, 'M', 'ꚏ'), (0xA68F, 'V'), (0xA690, 'M', 'ꚑ'), (0xA691, 'V'), + ] + +def _seg_36() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0xA692, 'M', 'ꚓ'), (0xA693, 'V'), (0xA694, 'M', 'ꚕ'), @@ -3854,14 +3850,14 @@ def _seg_36() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0xA779, 'M', 'ꝺ'), (0xA77A, 'V'), (0xA77B, 'M', 'ꝼ'), - ] - -def _seg_37() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0xA77C, 'V'), (0xA77D, 'M', 'ᵹ'), (0xA77E, 'M', 'ꝿ'), (0xA77F, 'V'), + ] + +def _seg_37() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0xA780, 'M', 'ꞁ'), (0xA781, 'V'), (0xA782, 'M', 'ꞃ'), @@ -3958,14 +3954,14 @@ def _seg_37() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0xA878, 'X'), (0xA880, 'V'), (0xA8C6, 'X'), - ] - -def _seg_38() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0xA8CE, 'V'), (0xA8DA, 'X'), (0xA8E0, 'V'), (0xA954, 'X'), + ] + +def _seg_38() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0xA95F, 'V'), (0xA97D, 'X'), (0xA980, 'V'), @@ -4062,14 +4058,14 @@ def _seg_38() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0xABA8, 'M', 'Ꮨ'), (0xABA9, 'M', 'Ꮩ'), (0xABAA, 'M', 'Ꮪ'), - ] - -def _seg_39() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0xABAB, 'M', 'Ꮫ'), (0xABAC, 'M', 'Ꮬ'), (0xABAD, 'M', 'Ꮭ'), (0xABAE, 'M', 'Ꮮ'), + ] + +def _seg_39() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0xABAF, 'M', 'Ꮯ'), (0xABB0, 'M', 'Ꮰ'), (0xABB1, 'M', 'Ꮱ'), @@ -4166,14 +4162,14 @@ def _seg_39() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0xF943, 'M', '弄'), (0xF944, 'M', '籠'), (0xF945, 'M', '聾'), - ] - -def _seg_40() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0xF946, 'M', '牢'), (0xF947, 'M', '磊'), (0xF948, 'M', '賂'), (0xF949, 'M', '雷'), + ] + +def _seg_40() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + 
return [ (0xF94A, 'M', '壘'), (0xF94B, 'M', '屢'), (0xF94C, 'M', '樓'), @@ -4270,14 +4266,14 @@ def _seg_40() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0xF9A7, 'M', '獵'), (0xF9A8, 'M', '令'), (0xF9A9, 'M', '囹'), - ] - -def _seg_41() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0xF9AA, 'M', '寧'), (0xF9AB, 'M', '嶺'), (0xF9AC, 'M', '怜'), (0xF9AD, 'M', '玲'), + ] + +def _seg_41() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0xF9AE, 'M', '瑩'), (0xF9AF, 'M', '羚'), (0xF9B0, 'M', '聆'), @@ -4374,14 +4370,14 @@ def _seg_41() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0xFA0B, 'M', '廓'), (0xFA0C, 'M', '兀'), (0xFA0D, 'M', '嗀'), - ] - -def _seg_42() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0xFA0E, 'V'), (0xFA10, 'M', '塚'), (0xFA11, 'V'), (0xFA12, 'M', '晴'), + ] + +def _seg_42() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0xFA13, 'V'), (0xFA15, 'M', '凞'), (0xFA16, 'M', '猪'), @@ -4478,14 +4474,14 @@ def _seg_42() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0xFA76, 'M', '勇'), (0xFA77, 'M', '勺'), (0xFA78, 'M', '喝'), - ] - -def _seg_43() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0xFA79, 'M', '啕'), (0xFA7A, 'M', '喙'), (0xFA7B, 'M', '嗢'), (0xFA7C, 'M', '塚'), + ] + +def _seg_43() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0xFA7D, 'M', '墳'), (0xFA7E, 'M', '奄'), (0xFA7F, 'M', '奔'), @@ -4582,14 +4578,14 @@ def _seg_43() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0xFADA, 'X'), (0xFB00, 'M', 'ff'), (0xFB01, 'M', 'fi'), - ] - -def _seg_44() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0xFB02, 'M', 'fl'), (0xFB03, 'M', 'ffi'), (0xFB04, 'M', 'ffl'), (0xFB05, 'M', 'st'), + ] + +def _seg_44() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0xFB07, 'X'), (0xFB13, 'M', 'մն'), (0xFB14, 'M', 'մե'), @@ -4686,14 +4682,14 @@ def _seg_44() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0xFBDB, 'M', 'ۈ'), (0xFBDD, 'M', 'ۇٴ'), (0xFBDE, 'M', 'ۋ'), - ] - -def _seg_45() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0xFBE0, 'M', 'ۅ'), (0xFBE2, 'M', 'ۉ'), (0xFBE4, 'M', 'ې'), (0xFBE8, 'M', 'ى'), + ] + +def _seg_45() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0xFBEA, 'M', 'ئا'), (0xFBEC, 'M', 'ئە'), (0xFBEE, 'M', 'ئو'), @@ -4790,14 +4786,14 @@ def _seg_45() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0xFC54, 'M', 'هي'), (0xFC55, 'M', 'يج'), (0xFC56, 'M', 'يح'), - ] - -def _seg_46() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0xFC57, 'M', 'يخ'), (0xFC58, 'M', 'يم'), (0xFC59, 'M', 'يى'), (0xFC5A, 'M', 'يي'), + ] + +def _seg_46() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0xFC5B, 'M', 'ذٰ'), (0xFC5C, 'M', 'رٰ'), (0xFC5D, 'M', 'ىٰ'), @@ -4894,14 +4890,14 @@ def _seg_46() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0xFCB8, 'M', 'طح'), (0xFCB9, 'M', 'ظم'), (0xFCBA, 'M', 'عج'), - ] - -def _seg_47() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0xFCBB, 'M', 'عم'), (0xFCBC, 'M', 'غج'), (0xFCBD, 'M', 'غم'), (0xFCBE, 'M', 'فج'), + ] + +def _seg_47() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0xFCBF, 'M', 'فح'), (0xFCC0, 'M', 'فخ'), (0xFCC1, 'M', 'فم'), @@ -4998,14 +4994,14 @@ def _seg_47() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0xFD1C, 'M', 'حي'), (0xFD1D, 'M', 'جى'), (0xFD1E, 'M', 'جي'), - ] - -def _seg_48() -> List[Union[Tuple[int, str], 
Tuple[int, str, str]]]: - return [ (0xFD1F, 'M', 'خى'), (0xFD20, 'M', 'خي'), (0xFD21, 'M', 'صى'), (0xFD22, 'M', 'صي'), + ] + +def _seg_48() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0xFD23, 'M', 'ضى'), (0xFD24, 'M', 'ضي'), (0xFD25, 'M', 'شج'), @@ -5102,14 +5098,14 @@ def _seg_48() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0xFDA4, 'M', 'تمى'), (0xFDA5, 'M', 'جمي'), (0xFDA6, 'M', 'جحى'), - ] - -def _seg_49() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0xFDA7, 'M', 'جمى'), (0xFDA8, 'M', 'سخى'), (0xFDA9, 'M', 'صحي'), (0xFDAA, 'M', 'شحي'), + ] + +def _seg_49() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0xFDAB, 'M', 'ضحي'), (0xFDAC, 'M', 'لجي'), (0xFDAD, 'M', 'لمي'), @@ -5206,14 +5202,14 @@ def _seg_49() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0xFE5B, '3', '{'), (0xFE5C, '3', '}'), (0xFE5D, 'M', '〔'), - ] - -def _seg_50() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0xFE5E, 'M', '〕'), (0xFE5F, '3', '#'), (0xFE60, '3', '&'), (0xFE61, '3', '*'), + ] + +def _seg_50() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0xFE62, '3', '+'), (0xFE63, 'M', '-'), (0xFE64, '3', '<'), @@ -5310,14 +5306,14 @@ def _seg_50() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0xFF18, 'M', '8'), (0xFF19, 'M', '9'), (0xFF1A, '3', ':'), - ] - -def _seg_51() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0xFF1B, '3', ';'), (0xFF1C, '3', '<'), (0xFF1D, '3', '='), (0xFF1E, '3', '>'), + ] + +def _seg_51() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0xFF1F, '3', '?'), (0xFF20, '3', '@'), (0xFF21, 'M', 'a'), @@ -5414,14 +5410,14 @@ def _seg_51() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0xFF7C, 'M', 'シ'), (0xFF7D, 'M', 'ス'), (0xFF7E, 'M', 'セ'), - ] - -def _seg_52() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0xFF7F, 'M', 'ソ'), (0xFF80, 'M', 'タ'), (0xFF81, 'M', 'チ'), (0xFF82, 'M', 'ツ'), + ] + +def _seg_52() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0xFF83, 'M', 'テ'), (0xFF84, 'M', 'ト'), (0xFF85, 'M', 'ナ'), @@ -5518,14 +5514,14 @@ def _seg_52() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0xFFE7, 'X'), (0xFFE8, 'M', '│'), (0xFFE9, 'M', '←'), - ] - -def _seg_53() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0xFFEA, 'M', '↑'), (0xFFEB, 'M', '→'), (0xFFEC, 'M', '↓'), (0xFFED, 'M', '■'), + ] + +def _seg_53() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0xFFEE, 'M', '○'), (0xFFEF, 'X'), (0x10000, 'V'), @@ -5622,14 +5618,14 @@ def _seg_53() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0x104B3, 'M', '𐓛'), (0x104B4, 'M', '𐓜'), (0x104B5, 'M', '𐓝'), - ] - -def _seg_54() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0x104B6, 'M', '𐓞'), (0x104B7, 'M', '𐓟'), (0x104B8, 'M', '𐓠'), (0x104B9, 'M', '𐓡'), + ] + +def _seg_54() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0x104BA, 'M', '𐓢'), (0x104BB, 'M', '𐓣'), (0x104BC, 'M', '𐓤'), @@ -5726,14 +5722,14 @@ def _seg_54() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0x10786, 'X'), (0x10787, 'M', 'ʣ'), (0x10788, 'M', 'ꭦ'), - ] - -def _seg_55() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0x10789, 'M', 'ʥ'), (0x1078A, 'M', 'ʤ'), (0x1078B, 'M', 'ɖ'), (0x1078C, 'M', 'ɗ'), + ] + +def _seg_55() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0x1078D, 'M', 'ᶑ'), (0x1078E, 'M', 'ɘ'), (0x1078F, 'M', 
'ɞ'), @@ -5830,14 +5826,14 @@ def _seg_55() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0x10A60, 'V'), (0x10AA0, 'X'), (0x10AC0, 'V'), - ] - -def _seg_56() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0x10AE7, 'X'), (0x10AEB, 'V'), (0x10AF7, 'X'), (0x10B00, 'V'), + ] + +def _seg_56() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0x10B36, 'X'), (0x10B39, 'V'), (0x10B56, 'X'), @@ -5934,14 +5930,14 @@ def _seg_56() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0x1107F, 'V'), (0x110BD, 'X'), (0x110BE, 'V'), - ] - -def _seg_57() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0x110C3, 'X'), (0x110D0, 'V'), (0x110E9, 'X'), (0x110F0, 'V'), + ] + +def _seg_57() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0x110FA, 'X'), (0x11100, 'V'), (0x11135, 'X'), @@ -6038,14 +6034,14 @@ def _seg_57() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0x118A4, 'M', '𑣄'), (0x118A5, 'M', '𑣅'), (0x118A6, 'M', '𑣆'), - ] - -def _seg_58() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0x118A7, 'M', '𑣇'), (0x118A8, 'M', '𑣈'), (0x118A9, 'M', '𑣉'), (0x118AA, 'M', '𑣊'), + ] + +def _seg_58() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0x118AB, 'M', '𑣋'), (0x118AC, 'M', '𑣌'), (0x118AD, 'M', '𑣍'), @@ -6142,14 +6138,14 @@ def _seg_58() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0x11EE0, 'V'), (0x11EF9, 'X'), (0x11F00, 'V'), - ] - -def _seg_59() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0x11F11, 'X'), (0x11F12, 'V'), (0x11F3B, 'X'), (0x11F3E, 'V'), + ] + +def _seg_59() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0x11F5A, 'X'), (0x11FB0, 'V'), (0x11FB1, 'X'), @@ -6246,14 +6242,14 @@ def _seg_59() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0x18D00, 'V'), (0x18D09, 'X'), (0x1AFF0, 'V'), - ] - -def _seg_60() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0x1AFF4, 'X'), (0x1AFF5, 'V'), (0x1AFFC, 'X'), (0x1AFFD, 'V'), + ] + +def _seg_60() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0x1AFFF, 'X'), (0x1B000, 'V'), (0x1B123, 'X'), @@ -6350,14 +6346,14 @@ def _seg_60() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0x1D41E, 'M', 'e'), (0x1D41F, 'M', 'f'), (0x1D420, 'M', 'g'), - ] - -def _seg_61() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0x1D421, 'M', 'h'), (0x1D422, 'M', 'i'), (0x1D423, 'M', 'j'), (0x1D424, 'M', 'k'), + ] + +def _seg_61() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0x1D425, 'M', 'l'), (0x1D426, 'M', 'm'), (0x1D427, 'M', 'n'), @@ -6454,14 +6450,14 @@ def _seg_61() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0x1D482, 'M', 'a'), (0x1D483, 'M', 'b'), (0x1D484, 'M', 'c'), - ] - -def _seg_62() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0x1D485, 'M', 'd'), (0x1D486, 'M', 'e'), (0x1D487, 'M', 'f'), (0x1D488, 'M', 'g'), + ] + +def _seg_62() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0x1D489, 'M', 'h'), (0x1D48A, 'M', 'i'), (0x1D48B, 'M', 'j'), @@ -6558,14 +6554,14 @@ def _seg_62() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0x1D4E9, 'M', 'z'), (0x1D4EA, 'M', 'a'), (0x1D4EB, 'M', 'b'), - ] - -def _seg_63() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0x1D4EC, 'M', 'c'), (0x1D4ED, 'M', 'd'), (0x1D4EE, 'M', 'e'), (0x1D4EF, 'M', 'f'), + ] + +def _seg_63() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + 
return [ (0x1D4F0, 'M', 'g'), (0x1D4F1, 'M', 'h'), (0x1D4F2, 'M', 'i'), @@ -6662,14 +6658,14 @@ def _seg_63() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0x1D550, 'M', 'y'), (0x1D551, 'X'), (0x1D552, 'M', 'a'), - ] - -def _seg_64() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0x1D553, 'M', 'b'), (0x1D554, 'M', 'c'), (0x1D555, 'M', 'd'), (0x1D556, 'M', 'e'), + ] + +def _seg_64() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0x1D557, 'M', 'f'), (0x1D558, 'M', 'g'), (0x1D559, 'M', 'h'), @@ -6766,14 +6762,14 @@ def _seg_64() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0x1D5B4, 'M', 'u'), (0x1D5B5, 'M', 'v'), (0x1D5B6, 'M', 'w'), - ] - -def _seg_65() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0x1D5B7, 'M', 'x'), (0x1D5B8, 'M', 'y'), (0x1D5B9, 'M', 'z'), (0x1D5BA, 'M', 'a'), + ] + +def _seg_65() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0x1D5BB, 'M', 'b'), (0x1D5BC, 'M', 'c'), (0x1D5BD, 'M', 'd'), @@ -6870,14 +6866,14 @@ def _seg_65() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0x1D618, 'M', 'q'), (0x1D619, 'M', 'r'), (0x1D61A, 'M', 's'), - ] - -def _seg_66() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0x1D61B, 'M', 't'), (0x1D61C, 'M', 'u'), (0x1D61D, 'M', 'v'), (0x1D61E, 'M', 'w'), + ] + +def _seg_66() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0x1D61F, 'M', 'x'), (0x1D620, 'M', 'y'), (0x1D621, 'M', 'z'), @@ -6974,14 +6970,14 @@ def _seg_66() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0x1D67C, 'M', 'm'), (0x1D67D, 'M', 'n'), (0x1D67E, 'M', 'o'), - ] - -def _seg_67() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0x1D67F, 'M', 'p'), (0x1D680, 'M', 'q'), (0x1D681, 'M', 'r'), (0x1D682, 'M', 's'), + ] + +def _seg_67() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0x1D683, 'M', 't'), (0x1D684, 'M', 'u'), (0x1D685, 'M', 'v'), @@ -7078,14 +7074,14 @@ def _seg_67() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0x1D6E2, 'M', 'α'), (0x1D6E3, 'M', 'β'), (0x1D6E4, 'M', 'γ'), - ] - -def _seg_68() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0x1D6E5, 'M', 'δ'), (0x1D6E6, 'M', 'ε'), (0x1D6E7, 'M', 'ζ'), (0x1D6E8, 'M', 'η'), + ] + +def _seg_68() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0x1D6E9, 'M', 'θ'), (0x1D6EA, 'M', 'ι'), (0x1D6EB, 'M', 'κ'), @@ -7182,14 +7178,14 @@ def _seg_68() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0x1D747, 'M', 'σ'), (0x1D749, 'M', 'τ'), (0x1D74A, 'M', 'υ'), - ] - -def _seg_69() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0x1D74B, 'M', 'φ'), (0x1D74C, 'M', 'χ'), (0x1D74D, 'M', 'ψ'), (0x1D74E, 'M', 'ω'), + ] + +def _seg_69() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0x1D74F, 'M', '∂'), (0x1D750, 'M', 'ε'), (0x1D751, 'M', 'θ'), @@ -7286,14 +7282,14 @@ def _seg_69() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0x1D7AD, 'M', 'δ'), (0x1D7AE, 'M', 'ε'), (0x1D7AF, 'M', 'ζ'), - ] - -def _seg_70() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0x1D7B0, 'M', 'η'), (0x1D7B1, 'M', 'θ'), (0x1D7B2, 'M', 'ι'), (0x1D7B3, 'M', 'κ'), + ] + +def _seg_70() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0x1D7B4, 'M', 'λ'), (0x1D7B5, 'M', 'μ'), (0x1D7B6, 'M', 'ν'), @@ -7390,14 +7386,14 @@ def _seg_70() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0x1E030, 'M', 'а'), (0x1E031, 'M', 'б'), (0x1E032, 
'M', 'в'), - ] - -def _seg_71() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0x1E033, 'M', 'г'), (0x1E034, 'M', 'д'), (0x1E035, 'M', 'е'), (0x1E036, 'M', 'ж'), + ] + +def _seg_71() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0x1E037, 'M', 'з'), (0x1E038, 'M', 'и'), (0x1E039, 'M', 'к'), @@ -7494,14 +7490,14 @@ def _seg_71() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0x1E907, 'M', '𞤩'), (0x1E908, 'M', '𞤪'), (0x1E909, 'M', '𞤫'), - ] - -def _seg_72() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0x1E90A, 'M', '𞤬'), (0x1E90B, 'M', '𞤭'), (0x1E90C, 'M', '𞤮'), (0x1E90D, 'M', '𞤯'), + ] + +def _seg_72() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0x1E90E, 'M', '𞤰'), (0x1E90F, 'M', '𞤱'), (0x1E910, 'M', '𞤲'), @@ -7598,14 +7594,14 @@ def _seg_72() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0x1EE48, 'X'), (0x1EE49, 'M', 'ي'), (0x1EE4A, 'X'), - ] - -def _seg_73() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0x1EE4B, 'M', 'ل'), (0x1EE4C, 'X'), (0x1EE4D, 'M', 'ن'), (0x1EE4E, 'M', 'س'), + ] + +def _seg_73() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0x1EE4F, 'M', 'ع'), (0x1EE50, 'X'), (0x1EE51, 'M', 'ص'), @@ -7702,14 +7698,14 @@ def _seg_73() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0x1EEB2, 'M', 'ق'), (0x1EEB3, 'M', 'ر'), (0x1EEB4, 'M', 'ش'), - ] - -def _seg_74() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0x1EEB5, 'M', 'ت'), (0x1EEB6, 'M', 'ث'), (0x1EEB7, 'M', 'خ'), (0x1EEB8, 'M', 'ذ'), + ] + +def _seg_74() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0x1EEB9, 'M', 'ض'), (0x1EEBA, 'M', 'ظ'), (0x1EEBB, 'M', 'غ'), @@ -7806,14 +7802,14 @@ def _seg_74() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0x1F150, 'V'), (0x1F16A, 'M', 'mc'), (0x1F16B, 'M', 'md'), - ] - -def _seg_75() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0x1F16C, 'M', 'mr'), (0x1F16D, 'V'), (0x1F190, 'M', 'dj'), (0x1F191, 'V'), + ] + +def _seg_75() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0x1F1AE, 'X'), (0x1F1E6, 'V'), (0x1F200, 'M', 'ほか'), @@ -7910,14 +7906,14 @@ def _seg_75() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0x1FA54, 'X'), (0x1FA60, 'V'), (0x1FA6E, 'X'), - ] - -def _seg_76() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: - return [ (0x1FA70, 'V'), (0x1FA7D, 'X'), (0x1FA80, 'V'), (0x1FA89, 'X'), + ] + +def _seg_76() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ (0x1FA90, 'V'), (0x1FABE, 'X'), (0x1FABF, 'V'), @@ -7953,6 +7949,8 @@ def _seg_76() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0x2CEA2, 'X'), (0x2CEB0, 'V'), (0x2EBE1, 'X'), + (0x2EBF0, 'V'), + (0x2EE5E, 'X'), (0x2F800, 'M', '丽'), (0x2F801, 'M', '丸'), (0x2F802, 'M', '乁'), @@ -8014,12 +8012,12 @@ def _seg_76() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0x2F83C, 'M', '咞'), (0x2F83D, 'M', '吸'), (0x2F83E, 'M', '呈'), + (0x2F83F, 'M', '周'), + (0x2F840, 'M', '咢'), ] def _seg_77() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: return [ - (0x2F83F, 'M', '周'), - (0x2F840, 'M', '咢'), (0x2F841, 'M', '哶'), (0x2F842, 'M', '唐'), (0x2F843, 'M', '啓'), @@ -8118,12 +8116,12 @@ def _seg_77() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0x2F8A4, 'M', '𢛔'), (0x2F8A5, 'M', '惇'), (0x2F8A6, 'M', '慈'), + (0x2F8A7, 'M', '慌'), + (0x2F8A8, 'M', '慎'), ] def _seg_78() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: return [ - (0x2F8A7, 'M', 
'慌'), - (0x2F8A8, 'M', '慎'), (0x2F8A9, 'M', '慌'), (0x2F8AA, 'M', '慺'), (0x2F8AB, 'M', '憎'), @@ -8222,12 +8220,12 @@ def _seg_78() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0x2F908, 'M', '港'), (0x2F909, 'M', '湮'), (0x2F90A, 'M', '㴳'), + (0x2F90B, 'M', '滋'), + (0x2F90C, 'M', '滇'), ] def _seg_79() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: return [ - (0x2F90B, 'M', '滋'), - (0x2F90C, 'M', '滇'), (0x2F90D, 'M', '𣻑'), (0x2F90E, 'M', '淹'), (0x2F90F, 'M', '潮'), @@ -8326,12 +8324,12 @@ def _seg_79() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0x2F96F, 'M', '縂'), (0x2F970, 'M', '繅'), (0x2F971, 'M', '䌴'), + (0x2F972, 'M', '𦈨'), + (0x2F973, 'M', '𦉇'), ] def _seg_80() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: return [ - (0x2F972, 'M', '𦈨'), - (0x2F973, 'M', '𦉇'), (0x2F974, 'M', '䍙'), (0x2F975, 'M', '𦋙'), (0x2F976, 'M', '罺'), @@ -8430,12 +8428,12 @@ def _seg_80() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: (0x2F9D3, 'M', '𧲨'), (0x2F9D4, 'M', '貫'), (0x2F9D5, 'M', '賁'), + (0x2F9D6, 'M', '贛'), + (0x2F9D7, 'M', '起'), ] def _seg_81() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: return [ - (0x2F9D6, 'M', '贛'), - (0x2F9D7, 'M', '起'), (0x2F9D8, 'M', '𧼯'), (0x2F9D9, 'M', '𠠄'), (0x2F9DA, 'M', '跋'), diff --git a/lib/oauthlib/__init__.py b/lib/oauthlib/__init__.py index 5dbffc96..d9a5e38e 100644 --- a/lib/oauthlib/__init__.py +++ b/lib/oauthlib/__init__.py @@ -12,7 +12,7 @@ import logging from logging import NullHandler __author__ = 'The OAuthlib Community' -__version__ = '3.2.0' +__version__ = '3.2.2' logging.getLogger('oauthlib').addHandler(NullHandler()) diff --git a/lib/oauthlib/common.py b/lib/oauthlib/common.py index b5fbf52e..395e75ef 100644 --- a/lib/oauthlib/common.py +++ b/lib/oauthlib/common.py @@ -18,11 +18,9 @@ from urllib.parse import ( from . import get_debug try: - from secrets import randbits - from secrets import SystemRandom + from secrets import SystemRandom, randbits except ImportError: - from random import getrandbits as randbits - from random import SystemRandom + from random import SystemRandom, getrandbits as randbits UNICODE_ASCII_CHARACTER_SET = ('abcdefghijklmnopqrstuvwxyz' 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' diff --git a/lib/oauthlib/oauth1/__init__.py b/lib/oauthlib/oauth1/__init__.py index 07ef4226..9caf12a9 100644 --- a/lib/oauthlib/oauth1/__init__.py +++ b/lib/oauthlib/oauth1/__init__.py @@ -5,24 +5,19 @@ oauthlib.oauth1 This module is a wrapper for the most recent implementation of OAuth 1.0 Client and Server classes. 
""" -from .rfc5849 import Client -from .rfc5849 import (SIGNATURE_HMAC, - SIGNATURE_HMAC_SHA1, - SIGNATURE_HMAC_SHA256, - SIGNATURE_HMAC_SHA512, - SIGNATURE_RSA, - SIGNATURE_RSA_SHA1, - SIGNATURE_RSA_SHA256, - SIGNATURE_RSA_SHA512, - SIGNATURE_PLAINTEXT) -from .rfc5849 import SIGNATURE_TYPE_AUTH_HEADER, SIGNATURE_TYPE_QUERY -from .rfc5849 import SIGNATURE_TYPE_BODY +from .rfc5849 import ( + SIGNATURE_HMAC, SIGNATURE_HMAC_SHA1, SIGNATURE_HMAC_SHA256, + SIGNATURE_HMAC_SHA512, SIGNATURE_PLAINTEXT, SIGNATURE_RSA, + SIGNATURE_RSA_SHA1, SIGNATURE_RSA_SHA256, SIGNATURE_RSA_SHA512, + SIGNATURE_TYPE_AUTH_HEADER, SIGNATURE_TYPE_BODY, SIGNATURE_TYPE_QUERY, + Client, +) +from .rfc5849.endpoints import ( + AccessTokenEndpoint, AuthorizationEndpoint, RequestTokenEndpoint, + ResourceEndpoint, SignatureOnlyEndpoint, WebApplicationServer, +) +from .rfc5849.errors import ( + InsecureTransportError, InvalidClientError, InvalidRequestError, + InvalidSignatureMethodError, OAuth1Error, +) from .rfc5849.request_validator import RequestValidator -from .rfc5849.endpoints import RequestTokenEndpoint, AuthorizationEndpoint -from .rfc5849.endpoints import AccessTokenEndpoint, ResourceEndpoint -from .rfc5849.endpoints import SignatureOnlyEndpoint, WebApplicationServer -from .rfc5849.errors import (InsecureTransportError, - InvalidClientError, - InvalidRequestError, - InvalidSignatureMethodError, - OAuth1Error) diff --git a/lib/oauthlib/oauth1/rfc5849/endpoints/base.py b/lib/oauthlib/oauth1/rfc5849/endpoints/base.py index 3a8c2677..7831be7c 100644 --- a/lib/oauthlib/oauth1/rfc5849/endpoints/base.py +++ b/lib/oauthlib/oauth1/rfc5849/endpoints/base.py @@ -11,12 +11,11 @@ import time from oauthlib.common import CaseInsensitiveDict, Request, generate_token from .. import ( - CONTENT_TYPE_FORM_URLENCODED, - SIGNATURE_HMAC_SHA1, SIGNATURE_HMAC_SHA256, SIGNATURE_HMAC_SHA512, - SIGNATURE_RSA_SHA1, SIGNATURE_RSA_SHA256, SIGNATURE_RSA_SHA512, - SIGNATURE_PLAINTEXT, - SIGNATURE_TYPE_AUTH_HEADER, SIGNATURE_TYPE_BODY, - SIGNATURE_TYPE_QUERY, errors, signature, utils) + CONTENT_TYPE_FORM_URLENCODED, SIGNATURE_HMAC_SHA1, SIGNATURE_HMAC_SHA256, + SIGNATURE_HMAC_SHA512, SIGNATURE_PLAINTEXT, SIGNATURE_RSA_SHA1, + SIGNATURE_RSA_SHA256, SIGNATURE_RSA_SHA512, SIGNATURE_TYPE_AUTH_HEADER, + SIGNATURE_TYPE_BODY, SIGNATURE_TYPE_QUERY, errors, signature, utils, +) class BaseEndpoint: diff --git a/lib/oauthlib/oauth1/rfc5849/endpoints/request_token.py b/lib/oauthlib/oauth1/rfc5849/endpoints/request_token.py index bb67e71e..0323cfb8 100644 --- a/lib/oauthlib/oauth1/rfc5849/endpoints/request_token.py +++ b/lib/oauthlib/oauth1/rfc5849/endpoints/request_token.py @@ -152,7 +152,7 @@ class RequestTokenEndpoint(BaseEndpoint): request.client_key = self.request_validator.dummy_client # Note that `realm`_ is only used in authorization headers and how - # it should be interepreted is not included in the OAuth spec. + # it should be interpreted is not included in the OAuth spec. # However they could be seen as a scope or realm to which the # client has access and as such every client should be checked # to ensure it is authorized access to that scope or realm. @@ -164,7 +164,7 @@ class RequestTokenEndpoint(BaseEndpoint): # workflow where a client requests access to a specific realm. # This first step (obtaining request token) need not require a realm # and can then be identified by checking the require_resource_owner - # flag and abscence of realm. + # flag and absence of realm. 
# # Clients obtaining an access token will not supply a realm and it will # not be checked. Instead the previously requested realm should be diff --git a/lib/oauthlib/oauth1/rfc5849/endpoints/resource.py b/lib/oauthlib/oauth1/rfc5849/endpoints/resource.py index 45bdaaac..8641152e 100644 --- a/lib/oauthlib/oauth1/rfc5849/endpoints/resource.py +++ b/lib/oauthlib/oauth1/rfc5849/endpoints/resource.py @@ -113,7 +113,7 @@ class ResourceEndpoint(BaseEndpoint): request.resource_owner_key = self.request_validator.dummy_access_token # Note that `realm`_ is only used in authorization headers and how - # it should be interepreted is not included in the OAuth spec. + # it should be interpreted is not included in the OAuth spec. # However they could be seen as a scope or realm to which the # client has access and as such every client should be checked # to ensure it is authorized access to that scope or realm. @@ -125,7 +125,7 @@ class ResourceEndpoint(BaseEndpoint): # workflow where a client requests access to a specific realm. # This first step (obtaining request token) need not require a realm # and can then be identified by checking the require_resource_owner - # flag and abscence of realm. + # flag and absence of realm. # # Clients obtaining an access token will not supply a realm and it will # not be checked. Instead the previously requested realm should be diff --git a/lib/oauthlib/oauth1/rfc5849/request_validator.py b/lib/oauthlib/oauth1/rfc5849/request_validator.py index dc5bf0eb..e937aabf 100644 --- a/lib/oauthlib/oauth1/rfc5849/request_validator.py +++ b/lib/oauthlib/oauth1/rfc5849/request_validator.py @@ -19,7 +19,7 @@ class RequestValidator: Methods used to check the format of input parameters. Common tests include length, character set, membership, range or pattern. These tests are referred to as `whitelisting or blacklisting`_. Whitelisting is better - but blacklisting can be usefull to spot malicious activity. + but blacklisting can be useful to spot malicious activity. The following have methods a default implementation: - check_client_key @@ -443,7 +443,7 @@ class RequestValidator: :type request: oauthlib.common.Request :returns: None - Per `Section 2.3`__ of the spec: + Per `Section 2.3`_ of the spec: "The server MUST (...) ensure that the temporary credentials have not expired or been used before." @@ -831,7 +831,7 @@ class RequestValidator: """Associate an authorization verifier with a request token. :param token: A request token string. - :param verifier A dictionary containing the oauth_verifier and + :param verifier: A dictionary containing the oauth_verifier and oauth_token :param request: OAuthlib request. :type request: oauthlib.common.Request diff --git a/lib/oauthlib/oauth1/rfc5849/signature.py b/lib/oauthlib/oauth1/rfc5849/signature.py index a370ccd6..9cb1a517 100644 --- a/lib/oauthlib/oauth1/rfc5849/signature.py +++ b/lib/oauthlib/oauth1/rfc5849/signature.py @@ -37,15 +37,15 @@ should have no impact on properly behaving programs. import binascii import hashlib import hmac +import ipaddress import logging +import urllib.parse as urlparse import warnings from oauthlib.common import extract_params, safe_string_equals, urldecode -import urllib.parse as urlparse from . 
import utils - log = logging.getLogger(__name__) @@ -131,7 +131,12 @@ def base_string_uri(uri: str, host: str = None) -> str: raise ValueError('uri must be a string.') # FIXME: urlparse does not support unicode - scheme, netloc, path, params, query, fragment = urlparse.urlparse(uri) + output = urlparse.urlparse(uri) + scheme = output.scheme + hostname = output.hostname + port = output.port + path = output.path + params = output.params # The scheme, authority, and path of the request resource URI `RFC3986` # are included by constructing an "http" or "https" URI representing @@ -153,13 +158,22 @@ def base_string_uri(uri: str, host: str = None) -> str: # 1. The scheme and host MUST be in lowercase. scheme = scheme.lower() - netloc = netloc.lower() # Note: if ``host`` is used, it will be converted to lowercase below + if hostname is not None: + hostname = hostname.lower() # 2. The host and port values MUST match the content of the HTTP # request "Host" header field. if host is not None: - netloc = host.lower() # override value in uri with provided host + # NOTE: override value in uri with provided host + # Host argument is equal to netloc. It means it's missing scheme. + # Add it back, before parsing. + + host = host.lower() + host = f"{scheme}://{host}" + output = urlparse.urlparse(host) + hostname = output.hostname + port = output.port # 3. The port MUST be included if it is not the default port for the # scheme, and MUST be excluded if it is the default. Specifically, @@ -170,33 +184,28 @@ def base_string_uri(uri: str, host: str = None) -> str: # .. _`RFC2616`: https://tools.ietf.org/html/rfc2616 # .. _`RFC2818`: https://tools.ietf.org/html/rfc2818 - if ':' in netloc: - # Contains a colon ":", so try to parse as "host:port" + if hostname is None: + raise ValueError('missing host') - hostname, port_str = netloc.split(':', 1) + # NOTE: Try guessing if we're dealing with IP or hostname + try: + hostname = ipaddress.ip_address(hostname) + except ValueError: + pass - if len(hostname) == 0: - raise ValueError('missing host') # error: netloc was ":port" or ":" + if isinstance(hostname, ipaddress.IPv6Address): + hostname = f"[{hostname}]" + elif isinstance(hostname, ipaddress.IPv4Address): + hostname = f"{hostname}" - if len(port_str) == 0: - netloc = hostname # was "host:", so just use the host part - else: - try: - port_num = int(port_str) # try to parse into an integer number - except ValueError: - raise ValueError('port is not an integer') - - if port_num <= 0 or 65535 < port_num: - raise ValueError('port out of range') # 16-bit unsigned ints - if (scheme, port_num) in (('http', 80), ('https', 443)): - netloc = hostname # default port for scheme: exclude port num - else: - netloc = hostname + ':' + str(port_num) # use hostname:port + if port is not None and not (0 < port <= 65535): + raise ValueError('port out of range') # 16-bit unsigned ints + if (scheme, port) in (('http', 80), ('https', 443)): + netloc = hostname # default port for scheme: exclude port num + elif port: + netloc = f"{hostname}:{port}" # use hostname:port else: - # Does not contain a colon, so entire value must be the hostname - - if len(netloc) == 0: - raise ValueError('missing host') # error: netloc was empty string + netloc = hostname v = urlparse.urlunparse((scheme, netloc, path, params, '', '')) diff --git a/lib/oauthlib/oauth2/rfc6749/clients/backend_application.py b/lib/oauthlib/oauth2/rfc6749/clients/backend_application.py index 0e2a8299..e11e8fae 100644 --- a/lib/oauthlib/oauth2/rfc6749/clients/backend_application.py 
+++ b/lib/oauthlib/oauth2/rfc6749/clients/backend_application.py @@ -39,7 +39,7 @@ class BackendApplicationClient(Client): format per `Appendix B`_ in the HTTP request entity-body: :param body: Existing request body (URL encoded string) to embed parameters - into. This may contain extra paramters. Default ''. + into. This may contain extra parameters. Default ''. :param scope: The scope of the access request as described by `Section 3.3`_. diff --git a/lib/oauthlib/oauth2/rfc6749/clients/base.py b/lib/oauthlib/oauth2/rfc6749/clients/base.py index bb4c1338..d5eb0cc1 100644 --- a/lib/oauthlib/oauth2/rfc6749/clients/base.py +++ b/lib/oauthlib/oauth2/rfc6749/clients/base.py @@ -6,12 +6,12 @@ oauthlib.oauth2.rfc6749 This module is an implementation of various logic needed for consuming OAuth 2.0 RFC6749. """ +import base64 +import hashlib +import re +import secrets import time import warnings -import secrets -import re -import hashlib -import base64 from oauthlib.common import generate_token from oauthlib.oauth2.rfc6749 import tokens @@ -228,26 +228,21 @@ class Client: required parameters to the authorization URL. :param authorization_url: Provider authorization endpoint URL. - :param state: CSRF protection string. Will be automatically created if - not provided. The generated state is available via the ``state`` - attribute. Clients should verify that the state is unchanged and - present in the authorization response. This verification is done - automatically if using the ``authorization_response`` parameter - with ``prepare_token_request``. - + not provided. The generated state is available via the ``state`` + attribute. Clients should verify that the state is unchanged and + present in the authorization response. This verification is done + automatically if using the ``authorization_response`` parameter + with ``prepare_token_request``. :param redirect_url: Redirect URL to which the user will be returned - after authorization. Must be provided unless previously setup with - the provider. If provided then it must also be provided in the - token request. - + after authorization. Must be provided unless previously setup with + the provider. If provided then it must also be provided in the + token request. :param scope: List of scopes to request. Must be equal to - or a subset of the scopes granted when obtaining the refresh - token. If none is provided, the ones provided in the constructor are - used. - + or a subset of the scopes granted when obtaining the refresh + token. If none is provided, the ones provided in the constructor are + used. :param kwargs: Additional parameters to included in the request. - :returns: The prepared request tuple with (url, headers, body). """ if not is_secure_transport(authorization_url): @@ -271,22 +266,16 @@ class Client: credentials. :param token_url: Provider token creation endpoint URL. - :param authorization_response: The full redirection URL string, i.e. - the location to which the user was redirected after successfull - authorization. Used to mine credentials needed to obtain a token - in this step, such as authorization code. - + the location to which the user was redirected after successful + authorization. Used to mine credentials needed to obtain a token + in this step, such as authorization code. :param redirect_url: The redirect_url supplied with the authorization - request (if there was one). - + request (if there was one). :param state: - :param body: Existing request body (URL encoded string) to embed parameters - into. 
This may contain extra paramters. Default ''. - + into. This may contain extra parameters. Default ''. :param kwargs: Additional parameters to included in the request. - :returns: The prepared request tuple with (url, headers, body). """ if not is_secure_transport(token_url): @@ -312,19 +301,14 @@ class Client: obtain a new access token, and possibly a new refresh token. :param token_url: Provider token refresh endpoint URL. - :param refresh_token: Refresh token string. - :param body: Existing request body (URL encoded string) to embed parameters - into. This may contain extra paramters. Default ''. - + into. This may contain extra parameters. Default ''. :param scope: List of scopes to request. Must be equal to - or a subset of the scopes granted when obtaining the refresh - token. If none is provided, the ones provided in the constructor are - used. - + or a subset of the scopes granted when obtaining the refresh + token. If none is provided, the ones provided in the constructor are + used. :param kwargs: Additional parameters to included in the request. - :returns: The prepared request tuple with (url, headers, body). """ if not is_secure_transport(token_url): @@ -341,20 +325,14 @@ class Client: """Prepare a token revocation request. :param revocation_url: Provider token revocation endpoint URL. - :param token: The access or refresh token to be revoked (string). - :param token_type_hint: ``"access_token"`` (default) or - ``"refresh_token"``. This is optional and if you wish to not pass it you - must provide ``token_type_hint=None``. - + ``"refresh_token"``. This is optional and if you wish to not pass it you + must provide ``token_type_hint=None``. :param body: - :param callback: A jsonp callback such as ``package.callback`` to be invoked - upon receiving the response. Not that it should not include a () suffix. - + upon receiving the response. Not that it should not include a () suffix. :param kwargs: Additional parameters to included in the request. - :returns: The prepared request tuple with (url, headers, body). Note that JSONP request may use GET requests as the parameters will @@ -362,7 +340,7 @@ class Client: An example of a revocation request - .. code-block: http + .. code-block:: http POST /revoke HTTP/1.1 Host: server.example.com @@ -373,7 +351,7 @@ class Client: An example of a jsonp revocation request - .. code-block: http + .. code-block:: http GET /revoke?token=agabcdefddddafdd&callback=package.myCallback HTTP/1.1 Host: server.example.com @@ -382,9 +360,9 @@ class Client: and an error response - .. code-block: http + .. code-block:: javascript - package.myCallback({"error":"unsupported_token_type"}); + package.myCallback({"error":"unsupported_token_type"}); Note that these requests usually require client credentials, client_id in the case for public clients and provider specific authentication @@ -408,9 +386,10 @@ class Client: :param body: The response body from the token request. :param scope: Scopes originally requested. If none is provided, the ones - provided in the constructor are used. + provided in the constructor are used. :return: Dictionary of token parameters. - :raises: Warning if scope has changed. OAuth2Error if response is invalid. + :raises: Warning if scope has changed. :py:class:`oauthlib.oauth2.errors.OAuth2Error` + if response is invalid. These response are json encoded and could easily be parsed without the assistance of OAuthLib. 
However, there are a few subtle issues @@ -436,7 +415,7 @@ class Client: If omitted, the authorization server SHOULD provide the expiration time via other means or document the default value. - **scope** + **scope** Providers may supply this in all responses but are required to only if it has changed since the authorization request. @@ -454,20 +433,16 @@ class Client: If the authorization server issued a refresh token to the client, the client makes a refresh request to the token endpoint by adding the - following parameters using the "application/x-www-form-urlencoded" + following parameters using the `application/x-www-form-urlencoded` format in the HTTP request entity-body: - grant_type - REQUIRED. Value MUST be set to "refresh_token". - refresh_token - REQUIRED. The refresh token issued to the client. - scope - OPTIONAL. The scope of the access request as described by - Section 3.3. The requested scope MUST NOT include any scope - not originally granted by the resource owner, and if omitted is - treated as equal to the scope originally granted by the - resource owner. Note that if none is provided, the ones provided - in the constructor are used if any. + :param refresh_token: REQUIRED. The refresh token issued to the client. + :param scope: OPTIONAL. The scope of the access request as described by + Section 3.3. The requested scope MUST NOT include any scope + not originally granted by the resource owner, and if omitted is + treated as equal to the scope originally granted by the + resource owner. Note that if none is provided, the ones provided + in the constructor are used if any. """ refresh_token = refresh_token or self.refresh_token scope = self.scope if scope is None else scope @@ -492,18 +467,21 @@ class Client: def create_code_verifier(self, length): """Create PKCE **code_verifier** used in computing **code_challenge**. + See `RFC7636 Section 4.1`_ - :param length: REQUIRED. The length of the code_verifier. + :param length: REQUIRED. The length of the code_verifier. - The client first creates a code verifier, "code_verifier", for each - OAuth 2.0 [RFC6749] Authorization Request, in the following manner: + The client first creates a code verifier, "code_verifier", for each + OAuth 2.0 [RFC6749] Authorization Request, in the following manner: - code_verifier = high-entropy cryptographic random STRING using the - unreserved characters [A-Z] / [a-z] / [0-9] / "-" / "." / "_" / "~" - from Section 2.3 of [RFC3986], with a minimum length of 43 characters - and a maximum length of 128 characters. - - .. _`Section 4.1`: https://tools.ietf.org/html/rfc7636#section-4.1 + .. code-block:: text + + code_verifier = high-entropy cryptographic random STRING using the + unreserved characters [A-Z] / [a-z] / [0-9] / "-" / "." / "_" / "~" + from Section 2.3 of [RFC3986], with a minimum length of 43 characters + and a maximum length of 128 characters. + + .. _`RFC7636 Section 4.1`: https://tools.ietf.org/html/rfc7636#section-4.1 """ code_verifier = None @@ -525,33 +503,30 @@ class Client: def create_code_challenge(self, code_verifier, code_challenge_method=None): """Create PKCE **code_challenge** derived from the **code_verifier**. + See `RFC7636 Section 4.2`_ - :param code_verifier: REQUIRED. The **code_verifier** generated from create_code_verifier(). - :param code_challenge_method: OPTIONAL. The method used to derive the **code_challenge**. Acceptable - values include "S256". DEFAULT is "plain". + :param code_verifier: REQUIRED. The **code_verifier** generated from `create_code_verifier()`. 
+ :param code_challenge_method: OPTIONAL. The method used to derive the **code_challenge**. Acceptable values include `S256`. DEFAULT is `plain`. - - The client then creates a code challenge derived from the code + The client then creates a code challenge derived from the code verifier by using one of the following transformations on the code - verifier: + verifier:: - plain - code_challenge = code_verifier + plain + code_challenge = code_verifier + S256 + code_challenge = BASE64URL-ENCODE(SHA256(ASCII(code_verifier))) - S256 - code_challenge = BASE64URL-ENCODE(SHA256(ASCII(code_verifier))) - - If the client is capable of using "S256", it MUST use "S256", as - "S256" is Mandatory To Implement (MTI) on the server. Clients are - permitted to use "plain" only if they cannot support "S256" for some + If the client is capable of using `S256`, it MUST use `S256`, as + `S256` is Mandatory To Implement (MTI) on the server. Clients are + permitted to use `plain` only if they cannot support `S256` for some technical reason and know via out-of-band configuration that the - server supports "plain". + server supports `plain`. The plain transformation is for compatibility with existing - deployments and for constrained environments that can't use the S256 - transformation. + deployments and for constrained environments that can't use the S256 transformation. - .. _`Section 4.2`: https://tools.ietf.org/html/rfc7636#section-4.2 + .. _`RFC7636 Section 4.2`: https://tools.ietf.org/html/rfc7636#section-4.2 """ code_challenge = None diff --git a/lib/oauthlib/oauth2/rfc6749/clients/legacy_application.py b/lib/oauthlib/oauth2/rfc6749/clients/legacy_application.py index 7af68f34..9920981d 100644 --- a/lib/oauthlib/oauth2/rfc6749/clients/legacy_application.py +++ b/lib/oauthlib/oauth2/rfc6749/clients/legacy_application.py @@ -49,7 +49,7 @@ class LegacyApplicationClient(Client): :param username: The resource owner username. :param password: The resource owner password. :param body: Existing request body (URL encoded string) to embed parameters - into. This may contain extra paramters. Default ''. + into. This may contain extra parameters. Default ''. :param scope: The scope of the access request as described by `Section 3.3`_. :param include_client_id: `True` to send the `client_id` in the diff --git a/lib/oauthlib/oauth2/rfc6749/clients/mobile_application.py b/lib/oauthlib/oauth2/rfc6749/clients/mobile_application.py index cd325f4e..b10b41ce 100644 --- a/lib/oauthlib/oauth2/rfc6749/clients/mobile_application.py +++ b/lib/oauthlib/oauth2/rfc6749/clients/mobile_application.py @@ -55,7 +55,7 @@ class MobileApplicationClient(Client): using the "application/x-www-form-urlencoded" format, per `Appendix B`_: :param redirect_uri: OPTIONAL. The redirect URI must be an absolute URI - and it should have been registerd with the OAuth + and it should have been registered with the OAuth provider prior to use. As described in `Section 3.1.2`_. :param scope: OPTIONAL. The scope of the access request as described by diff --git a/lib/oauthlib/oauth2/rfc6749/clients/service_application.py b/lib/oauthlib/oauth2/rfc6749/clients/service_application.py index c751c8b0..8fb17377 100644 --- a/lib/oauthlib/oauth2/rfc6749/clients/service_application.py +++ b/lib/oauthlib/oauth2/rfc6749/clients/service_application.py @@ -31,7 +31,7 @@ class ServiceApplicationClient(Client): def __init__(self, client_id, private_key=None, subject=None, issuer=None, audience=None, **kwargs): - """Initalize a JWT client with defaults for implicit use later. 
+ """Initialize a JWT client with defaults for implicit use later. :param client_id: Client identifier given by the OAuth provider upon registration. @@ -99,7 +99,7 @@ class ServiceApplicationClient(Client): :param extra_claims: A dict of additional claims to include in the JWT. :param body: Existing request body (URL encoded string) to embed parameters - into. This may contain extra paramters. Default ''. + into. This may contain extra parameters. Default ''. :param scope: The scope of the access request. diff --git a/lib/oauthlib/oauth2/rfc6749/clients/web_application.py b/lib/oauthlib/oauth2/rfc6749/clients/web_application.py index 1d3b2b5b..50890fbf 100644 --- a/lib/oauthlib/oauth2/rfc6749/clients/web_application.py +++ b/lib/oauthlib/oauth2/rfc6749/clients/web_application.py @@ -49,7 +49,7 @@ class WebApplicationClient(Client): using the "application/x-www-form-urlencoded" format, per `Appendix B`_: :param redirect_uri: OPTIONAL. The redirect URI must be an absolute URI - and it should have been registerd with the OAuth + and it should have been registered with the OAuth provider prior to use. As described in `Section 3.1.2`_. :param scope: OPTIONAL. The scope of the access request as described by @@ -117,7 +117,7 @@ class WebApplicationClient(Client): values MUST be identical. :param body: Existing request body (URL encoded string) to embed parameters - into. This may contain extra paramters. Default ''. + into. This may contain extra parameters. Default ''. :param include_client_id: `True` (default) to send the `client_id` in the body of the upstream request. This is required diff --git a/lib/oauthlib/oauth2/rfc6749/endpoints/introspect.py b/lib/oauthlib/oauth2/rfc6749/endpoints/introspect.py index 63570d9c..3cc61e66 100644 --- a/lib/oauthlib/oauth2/rfc6749/endpoints/introspect.py +++ b/lib/oauthlib/oauth2/rfc6749/endpoints/introspect.py @@ -86,9 +86,9 @@ class IntrospectEndpoint(BaseEndpoint): an HTTP POST request with parameters sent as "application/x-www-form-urlencoded". - token REQUIRED. The string value of the token. + * token REQUIRED. The string value of the token. + * token_type_hint OPTIONAL. - token_type_hint OPTIONAL. A hint about the type of the token submitted for introspection. The protected resource MAY pass this parameter to help the authorization server optimize the token lookup. If the @@ -96,11 +96,9 @@ class IntrospectEndpoint(BaseEndpoint): extend its search across all of its supported token types. An authorization server MAY ignore this parameter, particularly if it is able to detect the token type automatically. - * access_token: An Access Token as defined in [`RFC6749`], - `section 1.4`_ - * refresh_token: A Refresh Token as defined in [`RFC6749`], - `section 1.5`_ + * access_token: An Access Token as defined in [`RFC6749`], `section 1.4`_ + * refresh_token: A Refresh Token as defined in [`RFC6749`], `section 1.5`_ The introspection endpoint MAY accept other OPTIONAL parameters to provide further context to the query. For diff --git a/lib/oauthlib/oauth2/rfc6749/endpoints/metadata.py b/lib/oauthlib/oauth2/rfc6749/endpoints/metadata.py index d43a8247..a2820f28 100644 --- a/lib/oauthlib/oauth2/rfc6749/endpoints/metadata.py +++ b/lib/oauthlib/oauth2/rfc6749/endpoints/metadata.py @@ -10,7 +10,7 @@ import copy import json import logging -from .. import grant_types +from .. 
import grant_types, utils from .authorization import AuthorizationEndpoint from .base import BaseEndpoint, catch_errors_and_unavailability from .introspect import IntrospectEndpoint @@ -68,7 +68,7 @@ class MetadataEndpoint(BaseEndpoint): raise ValueError("key {} is a mandatory metadata.".format(key)) elif is_issuer: - if not array[key].startswith("https"): + if not utils.is_secure_transport(array[key]): raise ValueError("key {}: {} must be an HTTPS URL".format(key, array[key])) if "?" in array[key] or "&" in array[key] or "#" in array[key]: raise ValueError("key {}: {} must not contain query or fragment components".format(key, array[key])) diff --git a/lib/oauthlib/oauth2/rfc6749/endpoints/revocation.py b/lib/oauthlib/oauth2/rfc6749/endpoints/revocation.py index 4aa5ec6e..596d0860 100644 --- a/lib/oauthlib/oauth2/rfc6749/endpoints/revocation.py +++ b/lib/oauthlib/oauth2/rfc6749/endpoints/revocation.py @@ -42,7 +42,7 @@ class RevocationEndpoint(BaseEndpoint): The authorization server responds with HTTP status code 200 if the - token has been revoked sucessfully or if the client submitted an + token has been revoked successfully or if the client submitted an invalid token. Note: invalid tokens do not cause an error response since the client @@ -95,7 +95,7 @@ class RevocationEndpoint(BaseEndpoint): submitted for revocation. Clients MAY pass this parameter in order to help the authorization server to optimize the token lookup. If the server is unable to locate the token using the given hint, it MUST - extend its search accross all of its supported token types. An + extend its search across all of its supported token types. An authorization server MAY ignore this parameter, particularly if it is able to detect the token type automatically. This specification defines two such values: diff --git a/lib/oauthlib/oauth2/rfc6749/grant_types/authorization_code.py b/lib/oauthlib/oauth2/rfc6749/grant_types/authorization_code.py index b799823e..858855a1 100644 --- a/lib/oauthlib/oauth2/rfc6749/grant_types/authorization_code.py +++ b/lib/oauthlib/oauth2/rfc6749/grant_types/authorization_code.py @@ -10,7 +10,6 @@ import logging from oauthlib import common from .. 
import errors -from ..utils import is_secure_transport from .base import GrantTypeBase log = logging.getLogger(__name__) @@ -547,20 +546,3 @@ class AuthorizationCodeGrant(GrantTypeBase): if challenge_method in self._code_challenge_methods: return self._code_challenge_methods[challenge_method](verifier, challenge) raise NotImplementedError('Unknown challenge_method %s' % challenge_method) - - def _create_cors_headers(self, request): - """If CORS is allowed, create the appropriate headers.""" - if 'origin' not in request.headers: - return {} - - origin = request.headers['origin'] - if not is_secure_transport(origin): - log.debug('Origin "%s" is not HTTPS, CORS not allowed.', origin) - return {} - elif not self.request_validator.is_origin_allowed( - request.client_id, origin, request): - log.debug('Invalid origin "%s", CORS not allowed.', origin) - return {} - else: - log.debug('Valid origin "%s", injecting CORS headers.', origin) - return {'Access-Control-Allow-Origin': origin} diff --git a/lib/oauthlib/oauth2/rfc6749/grant_types/base.py b/lib/oauthlib/oauth2/rfc6749/grant_types/base.py index a64f168c..ca343a11 100644 --- a/lib/oauthlib/oauth2/rfc6749/grant_types/base.py +++ b/lib/oauthlib/oauth2/rfc6749/grant_types/base.py @@ -10,6 +10,7 @@ from oauthlib.oauth2.rfc6749 import errors, utils from oauthlib.uri_validate import is_absolute_uri from ..request_validator import RequestValidator +from ..utils import is_secure_transport log = logging.getLogger(__name__) @@ -248,3 +249,20 @@ class GrantTypeBase: raise errors.MissingRedirectURIError(request=request) if not is_absolute_uri(request.redirect_uri): raise errors.InvalidRedirectURIError(request=request) + + def _create_cors_headers(self, request): + """If CORS is allowed, create the appropriate headers.""" + if 'origin' not in request.headers: + return {} + + origin = request.headers['origin'] + if not is_secure_transport(origin): + log.debug('Origin "%s" is not HTTPS, CORS not allowed.', origin) + return {} + elif not self.request_validator.is_origin_allowed( + request.client_id, origin, request): + log.debug('Invalid origin "%s", CORS not allowed.', origin) + return {} + else: + log.debug('Valid origin "%s", injecting CORS headers.', origin) + return {'Access-Control-Allow-Origin': origin} diff --git a/lib/oauthlib/oauth2/rfc6749/grant_types/refresh_token.py b/lib/oauthlib/oauth2/rfc6749/grant_types/refresh_token.py index f801de4a..ce33df0e 100644 --- a/lib/oauthlib/oauth2/rfc6749/grant_types/refresh_token.py +++ b/lib/oauthlib/oauth2/rfc6749/grant_types/refresh_token.py @@ -69,6 +69,7 @@ class RefreshTokenGrant(GrantTypeBase): log.debug('Issuing new token to client id %r (%r), %r.', request.client_id, request.client, token) + headers.update(self._create_cors_headers(request)) return headers, json.dumps(token), 200 def validate_token_request(self, request): diff --git a/lib/oauthlib/oauth2/rfc6749/parameters.py b/lib/oauthlib/oauth2/rfc6749/parameters.py index 44738bb4..8f6ce2c7 100644 --- a/lib/oauthlib/oauth2/rfc6749/parameters.py +++ b/lib/oauthlib/oauth2/rfc6749/parameters.py @@ -45,7 +45,7 @@ def prepare_grant_uri(uri, client_id, response_type, redirect_uri=None, back to the client. The parameter SHOULD be used for preventing cross-site request forgery as described in `Section 10.12`_. - :param code_challenge: PKCE paramater. A challenge derived from the + :param code_challenge: PKCE parameter. A challenge derived from the code_verifier that is sent in the authorization request, to be verified against later. 
:param code_challenge_method: PKCE parameter. A method that was used to derive the diff --git a/lib/oauthlib/oauth2/rfc6749/request_validator.py b/lib/oauthlib/oauth2/rfc6749/request_validator.py index 610a708d..3910c0b9 100644 --- a/lib/oauthlib/oauth2/rfc6749/request_validator.py +++ b/lib/oauthlib/oauth2/rfc6749/request_validator.py @@ -191,6 +191,7 @@ class RequestValidator: claims associated, or `None` in case the token is unknown. Below the list of registered claims you should be interested in: + - scope : space-separated list of scopes - client_id : client identifier - username : human-readable identifier for the resource owner @@ -204,10 +205,10 @@ class RequestValidator: - jti : string identifier for the token Note that most of them are coming directly from JWT RFC. More details - can be found in `Introspect Claims`_ or `_JWT Claims`_. + can be found in `Introspect Claims`_ or `JWT Claims`_. The implementation can use *token_type_hint* to improve lookup - efficency, but must fallback to other types to be compliant with RFC. + efficiency, but must fallback to other types to be compliant with RFC. The dict of claims is added to request.token after this method. @@ -443,6 +444,7 @@ class RequestValidator: - request.user - request.scopes - request.claims (if given) + OBS! The request.user attribute should be set to the resource owner associated with this authorization code. Similarly request.scopes must also be set. @@ -451,6 +453,7 @@ class RequestValidator: If PKCE is enabled (see 'is_pkce_required' and 'save_authorization_code') you MUST set the following based on the information stored: + - request.code_challenge - request.code_challenge_method @@ -561,7 +564,7 @@ class RequestValidator: OBS! The validation should also set the user attribute of the request to a valid resource owner, i.e. request.user = username or similar. If not set you will be unable to associate a token with a user in the - persistance method used (commonly, save_bearer_token). + persistence method used (commonly, save_bearer_token). :param username: Unicode username. :param password: Unicode password. @@ -671,6 +674,7 @@ class RequestValidator: Method is used by: - Authorization Code Grant + - Refresh Token Grant """ return False diff --git a/lib/oauthlib/oauth2/rfc6749/tokens.py b/lib/oauthlib/oauth2/rfc6749/tokens.py index 6284248d..0757d07e 100644 --- a/lib/oauthlib/oauth2/rfc6749/tokens.py +++ b/lib/oauthlib/oauth2/rfc6749/tokens.py @@ -257,6 +257,7 @@ def get_token_from_header(request): class TokenBase: + __slots__ = () def __call__(self, request, refresh_token=False): raise NotImplementedError('Subclasses must implement this method.') diff --git a/lib/oauthlib/oauth2/rfc8628/clients/device.py b/lib/oauthlib/oauth2/rfc8628/clients/device.py index 95c4f5a2..b9ba2150 100644 --- a/lib/oauthlib/oauth2/rfc8628/clients/device.py +++ b/lib/oauthlib/oauth2/rfc8628/clients/device.py @@ -5,12 +5,11 @@ oauthlib.oauth2.rfc8628 This module is an implementation of various logic needed for consuming and providing OAuth 2.0 Device Authorization RFC8628. """ - +from oauthlib.common import add_params_to_uri from oauthlib.oauth2 import BackendApplicationClient, Client from oauthlib.oauth2.rfc6749.errors import InsecureTransportError from oauthlib.oauth2.rfc6749.parameters import prepare_token_request from oauthlib.oauth2.rfc6749.utils import is_secure_transport, list_to_scope -from oauthlib.common import add_params_to_uri class DeviceClient(Client): @@ -62,7 +61,7 @@ class DeviceClient(Client): body. 
:param body: Existing request body (URL encoded string) to embed parameters - into. This may contain extra paramters. Default ''. + into. This may contain extra parameters. Default ''. :param scope: The scope of the access request as described by `Section 3.3`_. @@ -84,6 +83,8 @@ class DeviceClient(Client): >>> client.prepare_request_body(scope=['hello', 'world']) 'grant_type=urn:ietf:params:oauth:grant-type:device_code&scope=hello+world' + .. _`Section 3.2.1`: https://datatracker.ietf.org/doc/html/rfc6749#section-3.2.1 + .. _`Section 3.3`: https://datatracker.ietf.org/doc/html/rfc6749#section-3.3 .. _`Section 3.4`: https://datatracker.ietf.org/doc/html/rfc8628#section-3.4 """ diff --git a/lib/oauthlib/openid/connect/core/endpoints/userinfo.py b/lib/oauthlib/openid/connect/core/endpoints/userinfo.py index 1c29cc55..7aa2bbe9 100644 --- a/lib/oauthlib/openid/connect/core/endpoints/userinfo.py +++ b/lib/oauthlib/openid/connect/core/endpoints/userinfo.py @@ -69,7 +69,7 @@ class UserInfoEndpoint(BaseEndpoint): 5.3.1. UserInfo Request The Client sends the UserInfo Request using either HTTP GET or HTTP POST. The Access Token obtained from an OpenID Connect Authentication - Request MUST be sent as a Bearer Token, per Section 2 of OAuth 2.0 + Request MUST be sent as a Bearer Token, per `Section 2`_ of OAuth 2.0 Bearer Token Usage [RFC6750]. It is RECOMMENDED that the request use the HTTP GET method and the @@ -77,21 +77,28 @@ class UserInfoEndpoint(BaseEndpoint): The following is a non-normative example of a UserInfo Request: - GET /userinfo HTTP/1.1 - Host: server.example.com - Authorization: Bearer SlAV32hkKG + .. code-block:: http + + GET /userinfo HTTP/1.1 + Host: server.example.com + Authorization: Bearer SlAV32hkKG 5.3.3. UserInfo Error Response When an error condition occurs, the UserInfo Endpoint returns an Error - Response as defined in Section 3 of OAuth 2.0 Bearer Token Usage + Response as defined in `Section 3`_ of OAuth 2.0 Bearer Token Usage [RFC6750]. (HTTP errors unrelated to RFC 6750 are returned to the User Agent using the appropriate HTTP status code.) The following is a non-normative example of a UserInfo Error Response: - HTTP/1.1 401 Unauthorized - WWW-Authenticate: Bearer error="invalid_token", + .. code-block:: http + + HTTP/1.1 401 Unauthorized + WWW-Authenticate: Bearer error="invalid_token", error_description="The Access Token expired" + + .. _`Section 2`: https://datatracker.ietf.org/doc/html/rfc6750#section-2 + .. 
_`Section 3`: https://datatracker.ietf.org/doc/html/rfc6750#section-3 """ if not self.bearer.validate_request(request): raise errors.InvalidTokenError() diff --git a/lib/oauthlib/openid/connect/core/grant_types/base.py b/lib/oauthlib/openid/connect/core/grant_types/base.py index 76173e6c..33411dad 100644 --- a/lib/oauthlib/openid/connect/core/grant_types/base.py +++ b/lib/oauthlib/openid/connect/core/grant_types/base.py @@ -8,7 +8,6 @@ from oauthlib.oauth2.rfc6749.errors import ( ConsentRequired, InvalidRequestError, LoginRequired, ) - log = logging.getLogger(__name__) diff --git a/lib/oauthlib/openid/connect/core/grant_types/dispatchers.py b/lib/oauthlib/openid/connect/core/grant_types/dispatchers.py index 2734c387..5aa7d469 100644 --- a/lib/oauthlib/openid/connect/core/grant_types/dispatchers.py +++ b/lib/oauthlib/openid/connect/core/grant_types/dispatchers.py @@ -84,7 +84,7 @@ class AuthorizationTokenGrantDispatcher(Dispatcher): code = parameters.get('code', None) redirect_uri = parameters.get('redirect_uri', None) - # If code is not pressent fallback to `default_grant` which will + # If code is not present fallback to `default_grant` which will # raise an error for the missing `code` in `create_token_response` step. if code: scopes = self.request_validator.get_authorization_code_scopes(client_id, code, redirect_uri, request) diff --git a/lib/oauthlib/openid/connect/core/tokens.py b/lib/oauthlib/openid/connect/core/tokens.py index a312e2d2..936ab52e 100644 --- a/lib/oauthlib/openid/connect/core/tokens.py +++ b/lib/oauthlib/openid/connect/core/tokens.py @@ -4,7 +4,9 @@ authlib.openid.connect.core.tokens This module contains methods for adding JWT tokens to requests. """ -from oauthlib.oauth2.rfc6749.tokens import TokenBase, random_token_generator, get_token_from_header +from oauthlib.oauth2.rfc6749.tokens import ( + TokenBase, get_token_from_header, random_token_generator, +) class JWTToken(TokenBase): diff --git a/lib/oauthlib/uri_validate.py b/lib/oauthlib/uri_validate.py index 8a6d9c27..a6fe0fb2 100644 --- a/lib/oauthlib/uri_validate.py +++ b/lib/oauthlib/uri_validate.py @@ -66,7 +66,7 @@ IPv4address = r"%(dec_octet)s \. %(dec_octet)s \. %(dec_octet)s \. %(dec_octet)s ) # IPv6address -IPv6address = r"([A-Fa-f0-9:]+:+)+[A-Fa-f0-9]+" +IPv6address = r"([A-Fa-f0-9:]+[:$])[A-Fa-f0-9]{1,4}" # IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" ) IPvFuture = r"v %(HEXDIG)s+ \. 
(?: %(unreserved)s | %(sub_delims)s | : )+" % locals() diff --git a/lib/requests_oauthlib/__init__.py b/lib/requests_oauthlib/__init__.py index 0d3e49f9..865d72fb 100644 --- a/lib/requests_oauthlib/__init__.py +++ b/lib/requests_oauthlib/__init__.py @@ -1,3 +1,4 @@ +# ruff: noqa: F401 import logging from .oauth1_auth import OAuth1 @@ -5,7 +6,7 @@ from .oauth1_session import OAuth1Session from .oauth2_auth import OAuth2 from .oauth2_session import OAuth2Session, TokenUpdated -__version__ = "1.3.1" +__version__ = "2.0.0" import requests diff --git a/lib/requests_oauthlib/compliance_fixes/__init__.py b/lib/requests_oauthlib/compliance_fixes/__init__.py index 0e8e3ac8..8815ea0b 100644 --- a/lib/requests_oauthlib/compliance_fixes/__init__.py +++ b/lib/requests_oauthlib/compliance_fixes/__init__.py @@ -1,5 +1,4 @@ -from __future__ import absolute_import - +# ruff: noqa: F401 from .facebook import facebook_compliance_fix from .fitbit import fitbit_compliance_fix from .slack import slack_compliance_fix diff --git a/lib/requests_oauthlib/compliance_fixes/douban.py b/lib/requests_oauthlib/compliance_fixes/douban.py index ecc57b08..c8b99c72 100644 --- a/lib/requests_oauthlib/compliance_fixes/douban.py +++ b/lib/requests_oauthlib/compliance_fixes/douban.py @@ -1,14 +1,12 @@ import json -from oauthlib.common import to_unicode - def douban_compliance_fix(session): def fix_token_type(r): token = json.loads(r.text) token.setdefault("token_type", "Bearer") fixed_token = json.dumps(token) - r._content = to_unicode(fixed_token).encode("utf-8") + r._content = fixed_token.encode() return r session._client_default_token_placement = "query" diff --git a/lib/requests_oauthlib/compliance_fixes/ebay.py b/lib/requests_oauthlib/compliance_fixes/ebay.py index 4aa423b3..ef33f391 100644 --- a/lib/requests_oauthlib/compliance_fixes/ebay.py +++ b/lib/requests_oauthlib/compliance_fixes/ebay.py @@ -1,5 +1,4 @@ import json -from oauthlib.common import to_unicode def ebay_compliance_fix(session): @@ -13,7 +12,7 @@ def ebay_compliance_fix(session): if token.get("token_type") in ["Application Access Token", "User Access Token"]: token["token_type"] = "Bearer" fixed_token = json.dumps(token) - response._content = to_unicode(fixed_token).encode("utf-8") + response._content = fixed_token.encode() return response diff --git a/lib/requests_oauthlib/compliance_fixes/facebook.py b/lib/requests_oauthlib/compliance_fixes/facebook.py index 90e79212..f44558a8 100644 --- a/lib/requests_oauthlib/compliance_fixes/facebook.py +++ b/lib/requests_oauthlib/compliance_fixes/facebook.py @@ -1,11 +1,5 @@ from json import dumps - -try: - from urlparse import parse_qsl -except ImportError: - from urllib.parse import parse_qsl - -from oauthlib.common import to_unicode +from urllib.parse import parse_qsl def facebook_compliance_fix(session): @@ -26,7 +20,7 @@ def facebook_compliance_fix(session): if expires is not None: token["expires_in"] = expires token["token_type"] = "Bearer" - r._content = to_unicode(dumps(token)).encode("UTF-8") + r._content = dumps(token).encode() return r session.register_compliance_hook("access_token_response", _compliance_fix) diff --git a/lib/requests_oauthlib/compliance_fixes/fitbit.py b/lib/requests_oauthlib/compliance_fixes/fitbit.py index 7e627024..aacc68bf 100644 --- a/lib/requests_oauthlib/compliance_fixes/fitbit.py +++ b/lib/requests_oauthlib/compliance_fixes/fitbit.py @@ -8,8 +8,6 @@ MissingTokenError. 
from json import loads, dumps -from oauthlib.common import to_unicode - def fitbit_compliance_fix(session): def _missing_error(r): @@ -17,7 +15,7 @@ def fitbit_compliance_fix(session): if "errors" in token: # Set the error to the first one we have token["error"] = token["errors"][0]["errorType"] - r._content = to_unicode(dumps(token)).encode("UTF-8") + r._content = dumps(token).encode() return r session.register_compliance_hook("access_token_response", _missing_error) diff --git a/lib/requests_oauthlib/compliance_fixes/instagram.py b/lib/requests_oauthlib/compliance_fixes/instagram.py index 4e07fe08..7d5a2ad4 100644 --- a/lib/requests_oauthlib/compliance_fixes/instagram.py +++ b/lib/requests_oauthlib/compliance_fixes/instagram.py @@ -1,7 +1,4 @@ -try: - from urlparse import urlparse, parse_qs -except ImportError: - from urllib.parse import urlparse, parse_qs +from urllib.parse import urlparse, parse_qs from oauthlib.common import add_params_to_uri diff --git a/lib/requests_oauthlib/compliance_fixes/mailchimp.py b/lib/requests_oauthlib/compliance_fixes/mailchimp.py index c69ce9fd..0d602659 100644 --- a/lib/requests_oauthlib/compliance_fixes/mailchimp.py +++ b/lib/requests_oauthlib/compliance_fixes/mailchimp.py @@ -1,21 +1,19 @@ import json -from oauthlib.common import to_unicode - def mailchimp_compliance_fix(session): def _null_scope(r): token = json.loads(r.text) if "scope" in token and token["scope"] is None: token.pop("scope") - r._content = to_unicode(json.dumps(token)).encode("utf-8") + r._content = json.dumps(token).encode() return r def _non_zero_expiration(r): token = json.loads(r.text) if "expires_in" in token and token["expires_in"] == 0: token["expires_in"] = 3600 - r._content = to_unicode(json.dumps(token)).encode("utf-8") + r._content = json.dumps(token).encode() return r session.register_compliance_hook("access_token_response", _null_scope) diff --git a/lib/requests_oauthlib/compliance_fixes/plentymarkets.py b/lib/requests_oauthlib/compliance_fixes/plentymarkets.py index 9f605f05..859f0566 100644 --- a/lib/requests_oauthlib/compliance_fixes/plentymarkets.py +++ b/lib/requests_oauthlib/compliance_fixes/plentymarkets.py @@ -1,8 +1,6 @@ from json import dumps, loads import re -from oauthlib.common import to_unicode - def plentymarkets_compliance_fix(session): def _to_snake_case(n): @@ -22,7 +20,7 @@ def plentymarkets_compliance_fix(session): for k, v in token.items(): fixed_token[_to_snake_case(k)] = v - r._content = to_unicode(dumps(fixed_token)).encode("UTF-8") + r._content = dumps(fixed_token).encode() return r session.register_compliance_hook("access_token_response", _compliance_fix) diff --git a/lib/requests_oauthlib/compliance_fixes/slack.py b/lib/requests_oauthlib/compliance_fixes/slack.py index 3f574b03..9095a470 100644 --- a/lib/requests_oauthlib/compliance_fixes/slack.py +++ b/lib/requests_oauthlib/compliance_fixes/slack.py @@ -1,7 +1,4 @@ -try: - from urlparse import urlparse, parse_qs -except ImportError: - from urllib.parse import urlparse, parse_qs +from urllib.parse import urlparse, parse_qs from oauthlib.common import add_params_to_uri diff --git a/lib/requests_oauthlib/compliance_fixes/weibo.py b/lib/requests_oauthlib/compliance_fixes/weibo.py index 6733abeb..f1623fd6 100644 --- a/lib/requests_oauthlib/compliance_fixes/weibo.py +++ b/lib/requests_oauthlib/compliance_fixes/weibo.py @@ -1,13 +1,11 @@ from json import loads, dumps -from oauthlib.common import to_unicode - def weibo_compliance_fix(session): def _missing_token_type(r): token = loads(r.text) 
token["token_type"] = "Bearer" - r._content = to_unicode(dumps(token)).encode("UTF-8") + r._content = dumps(token).encode() return r session._client.default_token_placement = "query" diff --git a/lib/requests_oauthlib/oauth1_auth.py b/lib/requests_oauthlib/oauth1_auth.py index cfbbd590..f8c0bd6e 100644 --- a/lib/requests_oauthlib/oauth1_auth.py +++ b/lib/requests_oauthlib/oauth1_auth.py @@ -1,20 +1,15 @@ # -*- coding: utf-8 -*- -from __future__ import unicode_literals - import logging from oauthlib.common import extract_params from oauthlib.oauth1 import Client, SIGNATURE_HMAC, SIGNATURE_TYPE_AUTH_HEADER from oauthlib.oauth1 import SIGNATURE_TYPE_BODY -from requests.compat import is_py3 from requests.utils import to_native_string from requests.auth import AuthBase CONTENT_TYPE_FORM_URLENCODED = "application/x-www-form-urlencoded" CONTENT_TYPE_MULTI_PART = "multipart/form-data" -if is_py3: - unicode = str log = logging.getLogger(__name__) @@ -83,7 +78,7 @@ class OAuth1(AuthBase): or self.client.signature_type == SIGNATURE_TYPE_BODY ): content_type = CONTENT_TYPE_FORM_URLENCODED - if not isinstance(content_type, unicode): + if not isinstance(content_type, str): content_type = content_type.decode("utf-8") is_form_encoded = CONTENT_TYPE_FORM_URLENCODED in content_type @@ -96,17 +91,17 @@ class OAuth1(AuthBase): if is_form_encoded: r.headers["Content-Type"] = CONTENT_TYPE_FORM_URLENCODED r.url, headers, r.body = self.client.sign( - unicode(r.url), unicode(r.method), r.body or "", r.headers + str(r.url), str(r.method), r.body or "", r.headers ) elif self.force_include_body: # To allow custom clients to work on non form encoded bodies. r.url, headers, r.body = self.client.sign( - unicode(r.url), unicode(r.method), r.body or "", r.headers + str(r.url), str(r.method), r.body or "", r.headers ) else: # Omit body data in the signing of non form-encoded requests r.url, headers, _ = self.client.sign( - unicode(r.url), unicode(r.method), None, r.headers + str(r.url), str(r.method), None, r.headers ) r.prepare_headers(headers) diff --git a/lib/requests_oauthlib/oauth1_session.py b/lib/requests_oauthlib/oauth1_session.py index 88f2853c..7625c808 100644 --- a/lib/requests_oauthlib/oauth1_session.py +++ b/lib/requests_oauthlib/oauth1_session.py @@ -1,9 +1,4 @@ -from __future__ import unicode_literals - -try: - from urlparse import urlparse -except ImportError: - from urllib.parse import urlparse +from urllib.parse import urlparse import logging @@ -85,7 +80,7 @@ class OAuth1Session(requests.Session): 'https://api.twitter.com/oauth/authorize?oauth_token=sdf0o9823sjdfsdf&oauth_callback=https%3A%2F%2F127.0.0.1%2Fcallback' >>> >>> # Third step. Fetch the access token - >>> redirect_response = raw_input('Paste the full redirect URL here.') + >>> redirect_response = input('Paste the full redirect URL here.') >>> oauth_session.parse_authorization_response(redirect_response) { 'oauth_token: 'kjerht2309u', @@ -258,7 +253,7 @@ class OAuth1Session(requests.Session): return add_params_to_uri(url, kwargs.items()) def fetch_request_token(self, url, realm=None, **request_kwargs): - r"""Fetch a request token. + """Fetch a request token. This is the first step in the OAuth 1 workflow. A request token is obtained by making a signed post request to url. The token is then @@ -267,7 +262,7 @@ class OAuth1Session(requests.Session): :param url: The request token endpoint URL. :param realm: A list of realms to request access to. 
- :param \*\*request_kwargs: Optional arguments passed to ''post'' + :param request_kwargs: Optional arguments passed to ''post'' function in ''requests.Session'' :returns: The response in dict format. diff --git a/lib/requests_oauthlib/oauth2_auth.py b/lib/requests_oauthlib/oauth2_auth.py index b880f72f..f19f52ac 100644 --- a/lib/requests_oauthlib/oauth2_auth.py +++ b/lib/requests_oauthlib/oauth2_auth.py @@ -1,4 +1,3 @@ -from __future__ import unicode_literals from oauthlib.oauth2 import WebApplicationClient, InsecureTransportError from oauthlib.oauth2 import is_secure_transport from requests.auth import AuthBase diff --git a/lib/requests_oauthlib/oauth2_session.py b/lib/requests_oauthlib/oauth2_session.py index db446808..93cc4d7b 100644 --- a/lib/requests_oauthlib/oauth2_session.py +++ b/lib/requests_oauthlib/oauth2_session.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import logging from oauthlib.common import generate_token, urldecode @@ -46,6 +44,7 @@ class OAuth2Session(requests.Session): token=None, state=None, token_updater=None, + pkce=None, **kwargs ): """Construct a new OAuth 2 client session. @@ -72,18 +71,23 @@ class OAuth2Session(requests.Session): set a TokenUpdated warning will be raised when a token has been refreshed. This warning will carry the token in its token argument. + :param pkce: Set "S256" or "plain" to enable PKCE. Default is disabled. :param kwargs: Arguments to pass to the Session constructor. """ super(OAuth2Session, self).__init__(**kwargs) self._client = client or WebApplicationClient(client_id, token=token) self.token = token or {} - self.scope = scope + self._scope = scope self.redirect_uri = redirect_uri self.state = state or generate_token self._state = state self.auto_refresh_url = auto_refresh_url self.auto_refresh_kwargs = auto_refresh_kwargs or {} self.token_updater = token_updater + self._pkce = pkce + + if self._pkce not in ["S256", "plain", None]: + raise AttributeError("Wrong value for {}(.., pkce={})".format(self.__class__, self._pkce)) # Ensure that requests doesn't do any automatic auth. See #278. # The default behavior can be re-enabled by setting auth to None. @@ -95,8 +99,24 @@ class OAuth2Session(requests.Session): "access_token_response": set(), "refresh_token_response": set(), "protected_request": set(), + "refresh_token_request": set(), + "access_token_request": set(), } + @property + def scope(self): + """By default the scope from the client is used, except if overridden""" + if self._scope is not None: + return self._scope + elif self._client is not None: + return self._client.scope + else: + return None + + @scope.setter + def scope(self, scope): + self._scope = scope + def new_state(self): """Generates a state string to be used in authorizations.""" try: @@ -161,6 +181,13 @@ class OAuth2Session(requests.Session): :return: authorization_url, state """ state = state or self.new_state() + if self._pkce: + self._code_verifier = self._client.create_code_verifier(43) + kwargs["code_challenge_method"] = self._pkce + kwargs["code_challenge"] = self._client.create_code_challenge( + code_verifier=self._code_verifier, + code_challenge_method=self._pkce + ) return ( self._client.prepare_request_uri( url, @@ -185,7 +212,7 @@ class OAuth2Session(requests.Session): force_querystring=False, timeout=None, headers=None, - verify=True, + verify=None, proxies=None, include_client_id=None, client_secret=None, @@ -252,6 +279,13 @@ class OAuth2Session(requests.Session): "Please supply either code or " "authorization_response parameters." 
) + if self._pkce: + if self._code_verifier is None: + raise ValueError( + "Code verifier is not found, authorization URL must be generated before" + ) + kwargs["code_verifier"] = self._code_verifier + # Earlier versions of this library build an HTTPBasicAuth header out of # `username` and `password`. The RFC states, however these attributes # must be in the request body and not the header. @@ -325,7 +359,7 @@ class OAuth2Session(requests.Session): headers = headers or { "Accept": "application/json", - "Content-Type": "application/x-www-form-urlencoded;charset=UTF-8", + "Content-Type": "application/x-www-form-urlencoded", } self.token = {} request_kwargs = {} @@ -338,6 +372,12 @@ class OAuth2Session(requests.Session): else: raise ValueError("The method kwarg must be POST or GET.") + for hook in self.compliance_hook["access_token_request"]: + log.debug("Invoking access_token_request hook %s.", hook) + token_url, headers, request_kwargs = hook( + token_url, headers, request_kwargs + ) + r = self.request( method=method, url=token_url, @@ -388,7 +428,7 @@ class OAuth2Session(requests.Session): auth=None, timeout=None, headers=None, - verify=True, + verify=None, proxies=None, **kwargs ): @@ -426,9 +466,13 @@ class OAuth2Session(requests.Session): if headers is None: headers = { "Accept": "application/json", - "Content-Type": ("application/x-www-form-urlencoded;charset=UTF-8"), + "Content-Type": ("application/x-www-form-urlencoded"), } + for hook in self.compliance_hook["refresh_token_request"]: + log.debug("Invoking refresh_token_request hook %s.", hook) + token_url, headers, body = hook(token_url, headers, body) + r = self.post( token_url, data=dict(urldecode(body)), @@ -450,7 +494,7 @@ class OAuth2Session(requests.Session): r = hook(r) self.token = self._client.parse_request_body_response(r.text, scope=self.scope) - if not "refresh_token" in self.token: + if "refresh_token" not in self.token: log.debug("No new refresh token given. Re-using old.") self.token["refresh_token"] = refresh_token return self.token @@ -464,6 +508,7 @@ class OAuth2Session(requests.Session): withhold_token=False, client_id=None, client_secret=None, + files=None, **kwargs ): """Intercept all requests and add the OAuth 2 token if present.""" @@ -519,7 +564,7 @@ class OAuth2Session(requests.Session): log.debug("Supplying headers %s and data %s", headers, data) log.debug("Passing through key word arguments %s.", kwargs) return super(OAuth2Session, self).request( - method, url, headers=headers, data=data, **kwargs + method, url, headers=headers, data=data, files=files, **kwargs ) def register_compliance_hook(self, hook_type, hook): @@ -529,6 +574,8 @@ class OAuth2Session(requests.Session): access_token_response invoked before token parsing. refresh_token_response invoked before refresh token parsing. protected_request invoked before making a request. + access_token_request invoked before making a token fetch request. + refresh_token_request invoked before making a refresh request. If you find a new hook is needed please send a GitHub PR request or open an issue.
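
For reference, a minimal sketch of how the new `pkce` argument and the `access_token_request` compliance hook introduced by this patch might be used together. Only `OAuth2Session(pkce=...)`, `register_compliance_hook()`, `authorization_url()`, and `fetch_token()` come from the code changed above; the client id, scopes, provider URLs, and the `X-Debug` header are placeholders for illustration, not values taken from this changeset.

    from requests_oauthlib import OAuth2Session

    # Enable PKCE (new in 2.0.0); "S256" or "plain" are the accepted values.
    session = OAuth2Session(
        client_id="example-client-id",                  # placeholder
        redirect_uri="https://localhost/callback",      # placeholder
        scope=["profile"],                              # placeholder
        pkce="S256",
    )

    def tag_token_request(token_url, headers, request_kwargs):
        # Token-request hooks receive and must return the
        # (token_url, headers, request_kwargs) triple.
        headers["X-Debug"] = "1"                        # illustrative header only
        return token_url, headers, request_kwargs

    session.register_compliance_hook("access_token_request", tag_token_request)

    # authorization_url() creates the PKCE code verifier/challenge internally,
    # so it must be called before fetch_token() when pkce is enabled.
    authorization_url, state = session.authorization_url(
        "https://provider.example.com/oauth/authorize"  # placeholder
    )

    # After the user authorizes and is redirected back:
    # token = session.fetch_token(
    #     "https://provider.example.com/oauth/token",   # placeholder
    #     authorization_response=redirect_response,
    #     client_secret="example-secret",               # placeholder
    # )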