Update arrow-1.2.0

2025-07-06 13:11:15 -07:00 · 2021-10-14 20:43:42 -07:00 · 2021-10-14 20:43:42 -07:00 · 9a54fb9a44
commit 9a54fb9a44
parent 3b645cf6c3
11 changed files with 7793 additions and 2322 deletions
--- a/lib/arrow/parser.py
+++ b/lib/arrow/parser.py
@ -1,205 +1,555 @@
-# -*- coding: utf-8 -*-
-from __future__ import absolute_import
-from __future__ import unicode_literals
+"""Provides the :class:`Arrow <arrow.parser.DateTimeParser>` class, a better way to parse datetime strings."""

-from datetime import datetime
-from dateutil import tz
 import re
+import sys
+from datetime import datetime, timedelta
+from datetime import tzinfo as dt_tzinfo
+from functools import lru_cache
+from typing import (
+    Any,
+    ClassVar,
+    Dict,
+    Iterable,
+    List,
+    Match,
+    Optional,
+    Pattern,
+    SupportsFloat,
+    SupportsInt,
+    Tuple,
+    Union,
+    cast,
+    overload,
+)
+
+from dateutil import tz
+
 from arrow import locales
+from arrow.constants import DEFAULT_LOCALE
+from arrow.util import next_weekday, normalize_timestamp
+
+if sys.version_info < (3, 8):  # pragma: no cover
+    from typing_extensions import Literal, TypedDict
+else:
+    from typing import Literal, TypedDict  # pragma: no cover


-class ParserError(RuntimeError):
+class ParserError(ValueError):
    pass


-class DateTimeParser(object):
-
-    _FORMAT_RE = re.compile('(YYY?Y?|MM?M?M?|Do|DD?D?D?|d?d?d?d|HH?|hh?|mm?|ss?|S+|ZZ?Z?|a|A|X)')
-    _ESCAPE_RE = re.compile('\[[^\[\]]*\]')
-
-    _ONE_OR_MORE_DIGIT_RE = re.compile('\d+')
-    _ONE_OR_TWO_DIGIT_RE = re.compile('\d{1,2}')
-    _FOUR_DIGIT_RE = re.compile('\d{4}')
-    _TWO_DIGIT_RE = re.compile('\d{2}')
-    _TZ_RE = re.compile('[+\-]?\d{2}:?(\d{2})?')
-    _TZ_NAME_RE = re.compile('\w[\w+\-/]+')
+# Allows for ParserErrors to be propagated from _build_datetime()
+# when day_of_year errors occur.
+# Before this, the ParserErrors were caught by the try/except in
+# _parse_multiformat() and the appropriate error message was not
+# transmitted to the user.
+class ParserMatchError(ParserError):
+    pass


-    _BASE_INPUT_RE_MAP = {
-        'YYYY': _FOUR_DIGIT_RE,
-        'YY': _TWO_DIGIT_RE,
-        'MM': _TWO_DIGIT_RE,
-        'M': _ONE_OR_TWO_DIGIT_RE,
-        'DD': _TWO_DIGIT_RE,
-        'D': _ONE_OR_TWO_DIGIT_RE,
-        'HH': _TWO_DIGIT_RE,
-        'H': _ONE_OR_TWO_DIGIT_RE,
-        'hh': _TWO_DIGIT_RE,
-        'h': _ONE_OR_TWO_DIGIT_RE,
-        'mm': _TWO_DIGIT_RE,
-        'm': _ONE_OR_TWO_DIGIT_RE,
-        'ss': _TWO_DIGIT_RE,
-        's': _ONE_OR_TWO_DIGIT_RE,
-        'X': re.compile('\d+'),
-        'ZZZ': _TZ_NAME_RE,
-        'ZZ': _TZ_RE,
-        'Z': _TZ_RE,
-        'S': _ONE_OR_MORE_DIGIT_RE,
+_WEEKDATE_ELEMENT = Union[str, bytes, SupportsInt, bytearray]
+
+_FORMAT_TYPE = Literal[
+    "YYYY",
+    "YY",
+    "MM",
+    "M",
+    "DDDD",
+    "DDD",
+    "DD",
+    "D",
+    "HH",
+    "H",
+    "hh",
+    "h",
+    "mm",
+    "m",
+    "ss",
+    "s",
+    "X",
+    "x",
+    "ZZZ",
+    "ZZ",
+    "Z",
+    "S",
+    "W",
+    "MMMM",
+    "MMM",
+    "Do",
+    "dddd",
+    "ddd",
+    "d",
+    "a",
+    "A",
+]
+
+
+class _Parts(TypedDict, total=False):
+    year: int
+    month: int
+    day_of_year: int
+    day: int
+    hour: int
+    minute: int
+    second: int
+    microsecond: int
+    timestamp: float
+    expanded_timestamp: int
+    tzinfo: dt_tzinfo
+    am_pm: Literal["am", "pm"]
+    day_of_week: int
+    weekdate: Tuple[_WEEKDATE_ELEMENT, _WEEKDATE_ELEMENT, Optional[_WEEKDATE_ELEMENT]]
+
+
+class DateTimeParser:
+    _FORMAT_RE: ClassVar[Pattern[str]] = re.compile(
+        r"(YYY?Y?|MM?M?M?|Do|DD?D?D?|d?d?d?d|HH?|hh?|mm?|ss?|S+|ZZ?Z?|a|A|x|X|W)"
+    )
+    _ESCAPE_RE: ClassVar[Pattern[str]] = re.compile(r"\[[^\[\]]*\]")
+
+    _ONE_OR_TWO_DIGIT_RE: ClassVar[Pattern[str]] = re.compile(r"\d{1,2}")
+    _ONE_OR_TWO_OR_THREE_DIGIT_RE: ClassVar[Pattern[str]] = re.compile(r"\d{1,3}")
+    _ONE_OR_MORE_DIGIT_RE: ClassVar[Pattern[str]] = re.compile(r"\d+")
+    _TWO_DIGIT_RE: ClassVar[Pattern[str]] = re.compile(r"\d{2}")
+    _THREE_DIGIT_RE: ClassVar[Pattern[str]] = re.compile(r"\d{3}")
+    _FOUR_DIGIT_RE: ClassVar[Pattern[str]] = re.compile(r"\d{4}")
+    _TZ_Z_RE: ClassVar[Pattern[str]] = re.compile(r"([\+\-])(\d{2})(?:(\d{2}))?|Z")
+    _TZ_ZZ_RE: ClassVar[Pattern[str]] = re.compile(r"([\+\-])(\d{2})(?:\:(\d{2}))?|Z")
+    _TZ_NAME_RE: ClassVar[Pattern[str]] = re.compile(r"\w[\w+\-/]+")
+    # NOTE: timestamps cannot be parsed from natural language strings (by removing the ^...$) because it will
+    # break cases like "15 Jul 2000" and a format list (see issue #447)
+    _TIMESTAMP_RE: ClassVar[Pattern[str]] = re.compile(r"^\-?\d+\.?\d+$")
+    _TIMESTAMP_EXPANDED_RE: ClassVar[Pattern[str]] = re.compile(r"^\-?\d+$")
+    _TIME_RE: ClassVar[Pattern[str]] = re.compile(
+        r"^(\d{2})(?:\:?(\d{2}))?(?:\:?(\d{2}))?(?:([\.\,])(\d+))?$"
+    )
+    _WEEK_DATE_RE: ClassVar[Pattern[str]] = re.compile(
+        r"(?P<year>\d{4})[\-]?W(?P<week>\d{2})[\-]?(?P<day>\d)?"
+    )
+
+    _BASE_INPUT_RE_MAP: ClassVar[Dict[_FORMAT_TYPE, Pattern[str]]] = {
+        "YYYY": _FOUR_DIGIT_RE,
+        "YY": _TWO_DIGIT_RE,
+        "MM": _TWO_DIGIT_RE,
+        "M": _ONE_OR_TWO_DIGIT_RE,
+        "DDDD": _THREE_DIGIT_RE,
+        "DDD": _ONE_OR_TWO_OR_THREE_DIGIT_RE,
+        "DD": _TWO_DIGIT_RE,
+        "D": _ONE_OR_TWO_DIGIT_RE,
+        "HH": _TWO_DIGIT_RE,
+        "H": _ONE_OR_TWO_DIGIT_RE,
+        "hh": _TWO_DIGIT_RE,
+        "h": _ONE_OR_TWO_DIGIT_RE,
+        "mm": _TWO_DIGIT_RE,
+        "m": _ONE_OR_TWO_DIGIT_RE,
+        "ss": _TWO_DIGIT_RE,
+        "s": _ONE_OR_TWO_DIGIT_RE,
+        "X": _TIMESTAMP_RE,
+        "x": _TIMESTAMP_EXPANDED_RE,
+        "ZZZ": _TZ_NAME_RE,
+        "ZZ": _TZ_ZZ_RE,
+        "Z": _TZ_Z_RE,
+        "S": _ONE_OR_MORE_DIGIT_RE,
+        "W": _WEEK_DATE_RE,
    }

-    MARKERS = ['YYYY', 'MM', 'DD']
-    SEPARATORS = ['-', '/', '.']
+    SEPARATORS: ClassVar[List[str]] = ["-", "/", "."]

-    def __init__(self, locale='en_us'):
+    locale: locales.Locale
+    _input_re_map: Dict[_FORMAT_TYPE, Pattern[str]]
+
+    def __init__(self, locale: str = DEFAULT_LOCALE, cache_size: int = 0) -> None:

        self.locale = locales.get_locale(locale)
        self._input_re_map = self._BASE_INPUT_RE_MAP.copy()
-        self._input_re_map.update({
-            'MMMM': self._choice_re(self.locale.month_names[1:], re.IGNORECASE),
-            'MMM': self._choice_re(self.locale.month_abbreviations[1:],
-                                   re.IGNORECASE),
-            'Do': re.compile(self.locale.ordinal_day_re),
-            'dddd': self._choice_re(self.locale.day_names[1:], re.IGNORECASE),
-            'ddd': self._choice_re(self.locale.day_abbreviations[1:],
-                                   re.IGNORECASE),
-            'd' : re.compile("[1-7]"),
-            'a': self._choice_re(
-                (self.locale.meridians['am'], self.locale.meridians['pm'])
-            ),
-            # note: 'A' token accepts both 'am/pm' and 'AM/PM' formats to
-            # ensure backwards compatibility of this token
-            'A': self._choice_re(self.locale.meridians.values())
-        })
+        self._input_re_map.update(
+            {
+                "MMMM": self._generate_choice_re(
+                    self.locale.month_names[1:], re.IGNORECASE
+                ),
+                "MMM": self._generate_choice_re(
+                    self.locale.month_abbreviations[1:], re.IGNORECASE
+                ),
+                "Do": re.compile(self.locale.ordinal_day_re),
+                "dddd": self._generate_choice_re(
+                    self.locale.day_names[1:], re.IGNORECASE
+                ),
+                "ddd": self._generate_choice_re(
+                    self.locale.day_abbreviations[1:], re.IGNORECASE
+                ),
+                "d": re.compile(r"[1-7]"),
+                "a": self._generate_choice_re(
+                    (self.locale.meridians["am"], self.locale.meridians["pm"])
+                ),
+                # note: 'A' token accepts both 'am/pm' and 'AM/PM' formats to
+                # ensure backwards compatibility of this token
+                "A": self._generate_choice_re(self.locale.meridians.values()),
+            }
+        )
+        if cache_size > 0:
+            self._generate_pattern_re = lru_cache(maxsize=cache_size)(  # type: ignore
+                self._generate_pattern_re
+            )

-    def parse_iso(self, string):
+    # TODO: since we support more than ISO 8601, we should rename this function
+    # IDEA: break into multiple functions
+    def parse_iso(
+        self, datetime_string: str, normalize_whitespace: bool = False
+    ) -> datetime:

-        has_time = 'T' in string or ' ' in string.strip()
-        space_divider = ' ' in string.strip()
+        if normalize_whitespace:
+            datetime_string = re.sub(r"\s+", " ", datetime_string.strip())
+
+        has_space_divider = " " in datetime_string
+        has_t_divider = "T" in datetime_string
+
+        num_spaces = datetime_string.count(" ")
+        if has_space_divider and num_spaces != 1 or has_t_divider and num_spaces > 0:
+            raise ParserError(
+                f"Expected an ISO 8601-like string, but was given {datetime_string!r}. "
+                "Try passing in a format string to resolve this."
+            )
+
+        has_time = has_space_divider or has_t_divider
+        has_tz = False
+
+        # date formats (ISO 8601 and others) to test against
+        # NOTE: YYYYMM is omitted to avoid confusion with YYMMDD (no longer part of ISO 8601, but is still often used)
+        formats = [
+            "YYYY-MM-DD",
+            "YYYY-M-DD",
+            "YYYY-M-D",
+            "YYYY/MM/DD",
+            "YYYY/M/DD",
+            "YYYY/M/D",
+            "YYYY.MM.DD",
+            "YYYY.M.DD",
+            "YYYY.M.D",
+            "YYYYMMDD",
+            "YYYY-DDDD",
+            "YYYYDDDD",
+            "YYYY-MM",
+            "YYYY/MM",
+            "YYYY.MM",
+            "YYYY",
+            "W",
+        ]

        if has_time:
-            if space_divider:
-                date_string, time_string = string.split(' ', 1)
+
+            if has_space_divider:
+                date_string, time_string = datetime_string.split(" ", 1)
            else:
-                date_string, time_string = string.split('T', 1)
-            time_parts = re.split('[+-]', time_string, 1)
-            has_tz = len(time_parts) > 1
-            has_seconds = time_parts[0].count(':') > 1
-            has_subseconds = re.search('[.,]', time_parts[0])
+                date_string, time_string = datetime_string.split("T", 1)
+
+            time_parts = re.split(r"[\+\-Z]", time_string, 1, re.IGNORECASE)
+
+            time_components: Optional[Match[str]] = self._TIME_RE.match(time_parts[0])
+
+            if time_components is None:
+                raise ParserError(
+                    "Invalid time component provided. "
+                    "Please specify a format or provide a valid time component in the basic or extended ISO 8601 time format."
+                )
+
+            (
+                hours,
+                minutes,
+                seconds,
+                subseconds_sep,
+                subseconds,
+            ) = time_components.groups()
+
+            has_tz = len(time_parts) == 2
+            has_minutes = minutes is not None
+            has_seconds = seconds is not None
+            has_subseconds = subseconds is not None
+
+            is_basic_time_format = ":" not in time_parts[0]
+            tz_format = "Z"
+
+            # use 'ZZ' token instead since tz offset is present in non-basic format
+            if has_tz and ":" in time_parts[1]:
+                tz_format = "ZZ"
+
+            time_sep = "" if is_basic_time_format else ":"

            if has_subseconds:
-                formats = ['YYYY-MM-DDTHH:mm:ss%sS' % has_subseconds.group()]
+                time_string = "HH{time_sep}mm{time_sep}ss{subseconds_sep}S".format(
+                    time_sep=time_sep, subseconds_sep=subseconds_sep
+                )
            elif has_seconds:
-                formats = ['YYYY-MM-DDTHH:mm:ss']
+                time_string = "HH{time_sep}mm{time_sep}ss".format(time_sep=time_sep)
+            elif has_minutes:
+                time_string = f"HH{time_sep}mm"
            else:
-                formats = ['YYYY-MM-DDTHH:mm']
-        else:
-            has_tz = False
-            # generate required formats: YYYY-MM-DD, YYYY-MM-DD, YYYY
-            # using various separators: -, /, .
-            l = len(self.MARKERS)
-            formats = [separator.join(self.MARKERS[:l-i])
-                        for i in range(l)
-                        for separator in self.SEPARATORS]
+                time_string = "HH"
+
+            if has_space_divider:
+                formats = [f"{f} {time_string}" for f in formats]
+            else:
+                formats = [f"{f}T{time_string}" for f in formats]

        if has_time and has_tz:
-            formats = [f + 'Z' for f in formats]
+            # Add "Z" or "ZZ" to the format strings to indicate to
+            # _parse_token() that a timezone needs to be parsed
+            formats = [f"{f}{tz_format}" for f in formats]

-        if space_divider:
-            formats = [item.replace('T', ' ', 1) for item in formats]
+        return self._parse_multiformat(datetime_string, formats)

-        return self._parse_multiformat(string, formats)
+    def parse(
+        self,
+        datetime_string: str,
+        fmt: Union[List[str], str],
+        normalize_whitespace: bool = False,
+    ) -> datetime:

-    def parse(self, string, fmt):
+        if normalize_whitespace:
+            datetime_string = re.sub(r"\s+", " ", datetime_string)

        if isinstance(fmt, list):
-            return self._parse_multiformat(string, fmt)
+            return self._parse_multiformat(datetime_string, fmt)
+
+        try:
+            fmt_tokens: List[_FORMAT_TYPE]
+            fmt_pattern_re: Pattern[str]
+            fmt_tokens, fmt_pattern_re = self._generate_pattern_re(fmt)
+        except re.error as e:
+            raise ParserMatchError(
+                f"Failed to generate regular expression pattern: {e}."
+            )
+
+        match = fmt_pattern_re.search(datetime_string)
+
+        if match is None:
+            raise ParserMatchError(
+                f"Failed to match {fmt!r} when parsing {datetime_string!r}."
+            )
+
+        parts: _Parts = {}
+        for token in fmt_tokens:
+            value: Union[Tuple[str, str, str], str]
+            if token == "Do":
+                value = match.group("value")
+            elif token == "W":
+                value = (match.group("year"), match.group("week"), match.group("day"))
+            else:
+                value = match.group(token)
+
+            if value is None:
+                raise ParserMatchError(
+                    f"Unable to find a match group for the specified token {token!r}."
+                )
+
+            self._parse_token(token, value, parts)  # type: ignore
+
+        return self._build_datetime(parts)
+
+    def _generate_pattern_re(self, fmt: str) -> Tuple[List[_FORMAT_TYPE], Pattern[str]]:

        # fmt is a string of tokens like 'YYYY-MM-DD'
        # we construct a new string by replacing each
        # token by its pattern:
        # 'YYYY-MM-DD' -> '(?P<YYYY>\d{4})-(?P<MM>\d{2})-(?P<DD>\d{2})'
-        tokens = []
+        tokens: List[_FORMAT_TYPE] = []
        offset = 0

+        # Escape all special RegEx chars
+        escaped_fmt = re.escape(fmt)
+
        # Extract the bracketed expressions to be reinserted later.
-        escaped_fmt = re.sub(self._ESCAPE_RE, "#" , fmt)
+        escaped_fmt = re.sub(self._ESCAPE_RE, "#", escaped_fmt)
+
        # Any number of S is the same as one.
-        escaped_fmt = re.sub('S+', 'S', escaped_fmt)
+        # TODO: allow users to specify the number of digits to parse
+        escaped_fmt = re.sub(r"S+", "S", escaped_fmt)
+
        escaped_data = re.findall(self._ESCAPE_RE, fmt)

        fmt_pattern = escaped_fmt

        for m in self._FORMAT_RE.finditer(escaped_fmt):
-            token = m.group(0)
+            token: _FORMAT_TYPE = cast(_FORMAT_TYPE, m.group(0))
            try:
                input_re = self._input_re_map[token]
            except KeyError:
-                raise ParserError('Unrecognized token \'{0}\''.format(token))
-            input_pattern = '(?P<{0}>{1})'.format(token, input_re.pattern)
+                raise ParserError(f"Unrecognized token {token!r}.")
+            input_pattern = f"(?P<{token}>{input_re.pattern})"
            tokens.append(token)
            # a pattern doesn't have the same length as the token
            # it replaces! We keep the difference in the offset variable.
            # This works because the string is scanned left-to-right and matches
            # are returned in the order found by finditer.
-            fmt_pattern = fmt_pattern[:m.start() + offset] + input_pattern + fmt_pattern[m.end() + offset:]
+            fmt_pattern = (
+                fmt_pattern[: m.start() + offset]
+                + input_pattern
+                + fmt_pattern[m.end() + offset :]
+            )
            offset += len(input_pattern) - (m.end() - m.start())

        final_fmt_pattern = ""
-        a = fmt_pattern.split("#")
-        b = escaped_data
+        split_fmt = fmt_pattern.split(r"\#")

-        # Due to the way Python splits, 'a' will always be longer
-        for i in range(len(a)):
-            final_fmt_pattern += a[i]
-            if i < len(b):
-                final_fmt_pattern += b[i][1:-1]
+        # Due to the way Python splits, 'split_fmt' will always be longer
+        for i in range(len(split_fmt)):
+            final_fmt_pattern += split_fmt[i]
+            if i < len(escaped_data):
+                final_fmt_pattern += escaped_data[i][1:-1]

-        match = re.search(final_fmt_pattern, string, flags=re.IGNORECASE)
-        if match is None:
-            raise ParserError('Failed to match \'{0}\' when parsing \'{1}\''.format(final_fmt_pattern, string))
-        parts = {}
-        for token in tokens:
-            if token == 'Do':
-                value = match.group('value')
-            else:
-                value = match.group(token)
-            self._parse_token(token, value, parts)
-        return self._build_datetime(parts)
+        # Wrap final_fmt_pattern in a custom word boundary to strictly
+        # match the formatting pattern and filter out date and time formats
+        # that include junk such as: blah1998-09-12 blah, blah 1998-09-12blah,
+        # blah1998-09-12blah. The custom word boundary matches every character
+        # that is not a whitespace character to allow for searching for a date
+        # and time string in a natural language sentence. Therefore, searching
+        # for a string of the form YYYY-MM-DD in "blah 1998-09-12 blah" will
+        # work properly.
+        # Certain punctuation before or after the target pattern such as
+        # "1998-09-12," is permitted. For the full list of valid punctuation,
+        # see the documentation.

-    def _parse_token(self, token, value, parts):
+        starting_word_boundary = (
+            r"(?<!\S\S)"  # Don't have two consecutive non-whitespace characters. This ensures that we allow cases
+            # like .11.25.2019 but not 1.11.25.2019 (for pattern MM.DD.YYYY)
+            r"(?<![^\,\.\;\:\?\!\"\'\`\[\]\{\}\(\)<>\s])"  # This is the list of punctuation that is ok before the
+            # pattern (i.e. "It can't not be these characters before the pattern")
+            r"(\b|^)"
+            # The \b is to block cases like 1201912 but allow 201912 for pattern YYYYMM. The ^ was necessary to allow a
+            # negative number through i.e. before epoch numbers
+        )
+        ending_word_boundary = (
+            r"(?=[\,\.\;\:\?\!\"\'\`\[\]\{\}\(\)\<\>]?"  # Positive lookahead stating that these punctuation marks
+            # can appear after the pattern at most 1 time
+            r"(?!\S))"  # Don't allow any non-whitespace character after the punctuation
+        )
+        bounded_fmt_pattern = r"{}{}{}".format(
+            starting_word_boundary, final_fmt_pattern, ending_word_boundary
+        )

-        if token == 'YYYY':
-            parts['year'] = int(value)
-        elif token == 'YY':
+        return tokens, re.compile(bounded_fmt_pattern, flags=re.IGNORECASE)
+
+    @overload
+    def _parse_token(
+        self,
+        token: Literal[
+            "YYYY",
+            "YY",
+            "MM",
+            "M",
+            "DDDD",
+            "DDD",
+            "DD",
+            "D",
+            "Do",
+            "HH",
+            "hh",
+            "h",
+            "H",
+            "mm",
+            "m",
+            "ss",
+            "s",
+            "x",
+        ],
+        value: Union[str, bytes, SupportsInt, bytearray],
+        parts: _Parts,
+    ) -> None:
+        ...  # pragma: no cover
+
+    @overload
+    def _parse_token(
+        self,
+        token: Literal["X"],
+        value: Union[str, bytes, SupportsFloat, bytearray],
+        parts: _Parts,
+    ) -> None:
+        ...  # pragma: no cover
+
+    @overload
+    def _parse_token(
+        self,
+        token: Literal["MMMM", "MMM", "dddd", "ddd", "S"],
+        value: Union[str, bytes, bytearray],
+        parts: _Parts,
+    ) -> None:
+        ...  # pragma: no cover
+
+    @overload
+    def _parse_token(
+        self,
+        token: Literal["a", "A", "ZZZ", "ZZ", "Z"],
+        value: Union[str, bytes],
+        parts: _Parts,
+    ) -> None:
+        ...  # pragma: no cover
+
+    @overload
+    def _parse_token(
+        self,
+        token: Literal["W"],
+        value: Tuple[_WEEKDATE_ELEMENT, _WEEKDATE_ELEMENT, Optional[_WEEKDATE_ELEMENT]],
+        parts: _Parts,
+    ) -> None:
+        ...  # pragma: no cover
+
+    def _parse_token(
+        self,
+        token: Any,
+        value: Any,
+        parts: _Parts,
+    ) -> None:
+
+        if token == "YYYY":
+            parts["year"] = int(value)
+
+        elif token == "YY":
            value = int(value)
-            parts['year'] = 1900 + value if value > 68 else 2000 + value
+            parts["year"] = 1900 + value if value > 68 else 2000 + value

-        elif token in ['MMMM', 'MMM']:
-            parts['month'] = self.locale.month_number(value.lower())
+        elif token in ["MMMM", "MMM"]:
+            # FIXME: month_number() is nullable
+            parts["month"] = self.locale.month_number(value.lower())  # type: ignore

-        elif token in ['MM', 'M']:
-            parts['month'] = int(value)
+        elif token in ["MM", "M"]:
+            parts["month"] = int(value)

-        elif token in ['DD', 'D']:
-            parts['day'] = int(value)
+        elif token in ["DDDD", "DDD"]:
+            parts["day_of_year"] = int(value)

-        elif token in ['Do']:
-            parts['day'] = int(value)
+        elif token in ["DD", "D"]:
+            parts["day"] = int(value)

-        elif token.upper() in ['HH', 'H']:
-            parts['hour'] = int(value)
+        elif token == "Do":
+            parts["day"] = int(value)

-        elif token in ['mm', 'm']:
-            parts['minute'] = int(value)
+        elif token == "dddd":
+            # locale day names are 1-indexed
+            day_of_week = [x.lower() for x in self.locale.day_names].index(
+                value.lower()
+            )
+            parts["day_of_week"] = day_of_week - 1

-        elif token in ['ss', 's']:
-            parts['second'] = int(value)
+        elif token == "ddd":
+            # locale day abbreviations are 1-indexed
+            day_of_week = [x.lower() for x in self.locale.day_abbreviations].index(
+                value.lower()
+            )
+            parts["day_of_week"] = day_of_week - 1

-        elif token == 'S':
+        elif token.upper() in ["HH", "H"]:
+            parts["hour"] = int(value)
+
+        elif token in ["mm", "m"]:
+            parts["minute"] = int(value)
+
+        elif token in ["ss", "s"]:
+            parts["second"] = int(value)
+
+        elif token == "S":
            # We have the *most significant* digits of an arbitrary-precision integer.
            # We want the six most significant digits as an integer, rounded.
-            # FIXME: add nanosecond support somehow?
-            value = value.ljust(7, str('0'))
+            # IDEA: add nanosecond support somehow? Need datetime support for it first.
+            value = value.ljust(7, "0")

            # floating-point (IEEE-754) defaults to half-to-even rounding
            seventh_digit = int(value[6])
@ -210,119 +560,220 @@ class DateTimeParser(object):
            else:
                rounding = 0

-            parts['microsecond'] = int(value[:6]) + rounding
+            parts["microsecond"] = int(value[:6]) + rounding

-        elif token == 'X':
-            parts['timestamp'] = int(value)
+        elif token == "X":
+            parts["timestamp"] = float(value)

-        elif token in ['ZZZ', 'ZZ', 'Z']:
-            parts['tzinfo'] = TzinfoParser.parse(value)
+        elif token == "x":
+            parts["expanded_timestamp"] = int(value)

-        elif token in ['a', 'A']:
-            if value in (
-                    self.locale.meridians['am'],
-                    self.locale.meridians['AM']
-            ):
-                parts['am_pm'] = 'am'
-            elif value in (
-                    self.locale.meridians['pm'],
-                    self.locale.meridians['PM']
-            ):
-                parts['am_pm'] = 'pm'
+        elif token in ["ZZZ", "ZZ", "Z"]:
+            parts["tzinfo"] = TzinfoParser.parse(value)
+
+        elif token in ["a", "A"]:
+            if value in (self.locale.meridians["am"], self.locale.meridians["AM"]):
+                parts["am_pm"] = "am"
+                if "hour" in parts and not 0 <= parts["hour"] <= 12:
+                    raise ParserMatchError(
+                        f"Hour token value must be between 0 and 12 inclusive for token {token!r}."
+                    )
+            elif value in (self.locale.meridians["pm"], self.locale.meridians["PM"]):
+                parts["am_pm"] = "pm"
+        elif token == "W":
+            parts["weekdate"] = value

    @staticmethod
-    def _build_datetime(parts):
+    def _build_datetime(parts: _Parts) -> datetime:
+        weekdate = parts.get("weekdate")

-        timestamp = parts.get('timestamp')
+        if weekdate is not None:

-        if timestamp:
-            tz_utc = tz.tzutc()
-            return datetime.fromtimestamp(timestamp, tz=tz_utc)
+            year, week = int(weekdate[0]), int(weekdate[1])

-        am_pm = parts.get('am_pm')
-        hour = parts.get('hour', 0)
+            if weekdate[2] is not None:
+                _day = int(weekdate[2])
+            else:
+                # day not given, default to 1
+                _day = 1

-        if am_pm == 'pm' and hour < 12:
+            date_string = f"{year}-{week}-{_day}"
+
+            #  tokens for ISO 8601 weekdates
+            dt = datetime.strptime(date_string, "%G-%V-%u")
+
+            parts["year"] = dt.year
+            parts["month"] = dt.month
+            parts["day"] = dt.day
+
+        timestamp = parts.get("timestamp")
+
+        if timestamp is not None:
+            return datetime.fromtimestamp(timestamp, tz=tz.tzutc())
+
+        expanded_timestamp = parts.get("expanded_timestamp")
+
+        if expanded_timestamp is not None:
+            return datetime.fromtimestamp(
+                normalize_timestamp(expanded_timestamp),
+                tz=tz.tzutc(),
+            )
+
+        day_of_year = parts.get("day_of_year")
+
+        if day_of_year is not None:
+            _year = parts.get("year")
+            month = parts.get("month")
+            if _year is None:
+                raise ParserError(
+                    "Year component is required with the DDD and DDDD tokens."
+                )
+
+            if month is not None:
+                raise ParserError(
+                    "Month component is not allowed with the DDD and DDDD tokens."
+                )
+
+            date_string = f"{_year}-{day_of_year}"
+            try:
+                dt = datetime.strptime(date_string, "%Y-%j")
+            except ValueError:
+                raise ParserError(
+                    f"The provided day of year {day_of_year!r} is invalid."
+                )
+
+            parts["year"] = dt.year
+            parts["month"] = dt.month
+            parts["day"] = dt.day
+
+        day_of_week: Optional[int] = parts.get("day_of_week")
+        day = parts.get("day")
+
+        # If day is passed, ignore day of week
+        if day_of_week is not None and day is None:
+            year = parts.get("year", 1970)
+            month = parts.get("month", 1)
+            day = 1
+
+            # dddd => first day of week after epoch
+            # dddd YYYY => first day of week in specified year
+            # dddd MM YYYY => first day of week in specified year and month
+            # dddd MM => first day after epoch in specified month
+            next_weekday_dt = next_weekday(datetime(year, month, day), day_of_week)
+            parts["year"] = next_weekday_dt.year
+            parts["month"] = next_weekday_dt.month
+            parts["day"] = next_weekday_dt.day
+
+        am_pm = parts.get("am_pm")
+        hour = parts.get("hour", 0)
+
+        if am_pm == "pm" and hour < 12:
            hour += 12
-        elif am_pm == 'am' and hour == 12:
+        elif am_pm == "am" and hour == 12:
            hour = 0

-        return datetime(year=parts.get('year', 1), month=parts.get('month', 1),
-            day=parts.get('day', 1), hour=hour, minute=parts.get('minute', 0),
-            second=parts.get('second', 0), microsecond=parts.get('microsecond', 0),
-            tzinfo=parts.get('tzinfo'))
+        # Support for midnight at the end of day
+        if hour == 24:
+            if parts.get("minute", 0) != 0:
+                raise ParserError("Midnight at the end of day must not contain minutes")
+            if parts.get("second", 0) != 0:
+                raise ParserError("Midnight at the end of day must not contain seconds")
+            if parts.get("microsecond", 0) != 0:
+                raise ParserError(
+                    "Midnight at the end of day must not contain microseconds"
+                )
+            hour = 0
+            day_increment = 1
+        else:
+            day_increment = 0

-    def _parse_multiformat(self, string, formats):
+        # account for rounding up to 1000000
+        microsecond = parts.get("microsecond", 0)
+        if microsecond == 1000000:
+            microsecond = 0
+            second_increment = 1
+        else:
+            second_increment = 0

-        _datetime = None
+        increment = timedelta(days=day_increment, seconds=second_increment)
+
+        return (
+            datetime(
+                year=parts.get("year", 1),
+                month=parts.get("month", 1),
+                day=parts.get("day", 1),
+                hour=hour,
+                minute=parts.get("minute", 0),
+                second=parts.get("second", 0),
+                microsecond=microsecond,
+                tzinfo=parts.get("tzinfo"),
+            )
+            + increment
+        )
+
+    def _parse_multiformat(self, string: str, formats: Iterable[str]) -> datetime:
+
+        _datetime: Optional[datetime] = None

        for fmt in formats:
            try:
                _datetime = self.parse(string, fmt)
                break
-            except ParserError:
+            except ParserMatchError:
                pass

        if _datetime is None:
-            raise ParserError('Could not match input to any of {0} on \'{1}\''.format(formats, string))
+            supported_formats = ", ".join(formats)
+            raise ParserError(
+                f"Could not match input {string!r} to any of the following formats: {supported_formats}."
+            )

        return _datetime

+    # generates a capture group of choices separated by an OR operator
    @staticmethod
-    def _map_lookup(input_map, key):
-
-        try:
-            return input_map[key]
-        except KeyError:
-            raise ParserError('Could not match "{0}" to {1}'.format(key, input_map))
-
-    @staticmethod
-    def _try_timestamp(string):
-
-        try:
-            return float(string)
-        except:
-            return None
-
-    @staticmethod
-    def _choice_re(choices, flags=0):
-        return re.compile('({0})'.format('|'.join(choices)), flags=flags)
+    def _generate_choice_re(
+        choices: Iterable[str], flags: Union[int, re.RegexFlag] = 0
+    ) -> Pattern[str]:
+        return re.compile(r"({})".format("|".join(choices)), flags=flags)


-class TzinfoParser(object):
-
-    _TZINFO_RE = re.compile('([+\-])?(\d\d):?(\d\d)?')
+class TzinfoParser:
+    _TZINFO_RE: ClassVar[Pattern[str]] = re.compile(
+        r"^([\+\-])?(\d{2})(?:\:?(\d{2}))?$"
+    )

    @classmethod
-    def parse(cls, string):
+    def parse(cls, tzinfo_string: str) -> dt_tzinfo:

-        tzinfo = None
+        tzinfo: Optional[dt_tzinfo] = None

-        if string == 'local':
+        if tzinfo_string == "local":
            tzinfo = tz.tzlocal()

-        elif string in ['utc', 'UTC']:
+        elif tzinfo_string in ["utc", "UTC", "Z"]:
            tzinfo = tz.tzutc()

        else:

-            iso_match = cls._TZINFO_RE.match(string)
+            iso_match = cls._TZINFO_RE.match(tzinfo_string)

            if iso_match:
+                sign: Optional[str]
+                hours: str
+                minutes: Union[str, int, None]
                sign, hours, minutes = iso_match.groups()
-                if minutes is None:
-                    minutes = 0
-                seconds = int(hours) * 3600 + int(minutes) * 60
+                seconds = int(hours) * 3600 + int(minutes or 0) * 60

-                if sign == '-':
+                if sign == "-":
                    seconds *= -1

                tzinfo = tz.tzoffset(None, seconds)

            else:
-                tzinfo = tz.gettz(string)
+                tzinfo = tz.gettz(tzinfo_string)

        if tzinfo is None:
-            raise ParserError('Could not parse timezone expression "{0}"'.format(string))
+            raise ParserError(f"Could not parse timezone expression {tzinfo_string!r}.")

        return tzinfo