mirror of https://github.com/Tautulli/Tautulli.git

Add tokenize_rt-3.2.0

parent d460263b97, commit 2dcc74d82d

1 changed file with 217 additions and 0 deletions:

lib/tokenize_rt.py (new file, 217 lines)
@@ -0,0 +1,217 @@
from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals

import argparse
import collections
import io
import keyword
import re
import tokenize
from typing import Generator
from typing import Iterable
from typing import List
from typing import Optional
from typing import Pattern
from typing import Sequence
from typing import Tuple


ESCAPED_NL = 'ESCAPED_NL'
UNIMPORTANT_WS = 'UNIMPORTANT_WS'
NON_CODING_TOKENS = frozenset(('COMMENT', ESCAPED_NL, 'NL', UNIMPORTANT_WS))
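# tokens of these types never change a program's meaning, so token-level
# rewriters can safely skip over them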


class Offset(collections.namedtuple('Offset', ('line', 'utf8_byte_offset'))):
    __slots__ = ()

    def __new__(cls, line=None, utf8_byte_offset=None):
        # type: (Optional[int], Optional[int]) -> None
        return super(Offset, cls).__new__(cls, line, utf8_byte_offset)


class Token(
        collections.namedtuple(
            'Token', ('name', 'src', 'line', 'utf8_byte_offset'),
        ),
):
    __slots__ = ()

    def __new__(cls, name, src, line=None, utf8_byte_offset=None):
        # type: (str, str, Optional[int], Optional[int]) -> None
        return super(Token, cls).__new__(
            cls, name, src, line, utf8_byte_offset,
        )

    @property
    def offset(self):  # type: () -> Offset
        return Offset(self.line, self.utf8_byte_offset)


_string_re = re.compile('^([^\'"]*)(.*)$', re.DOTALL)
_string_prefixes = frozenset('bfru')
_escaped_nl_re = re.compile(r'\\(\n|\r\n|\r)')


def _re_partition(regex, s):
    # type: (Pattern[str], str) -> Tuple[str, str, str]
    match = regex.search(s)
    if match:
        return s[:match.start()], s[slice(*match.span())], s[match.end():]
    else:
        return (s, '', '')
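# for illustration: _re_partition behaves like str.partition but splits on
# the regex's first match, e.g.
#     _re_partition(re.compile('-'), 'a-b-c') == ('a', '-', 'b-c')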


def src_to_tokens(src):  # type: (str) -> List[Token]
    tokenize_target = io.StringIO(src)
    lines = ('',) + tuple(tokenize_target)

    tokenize_target.seek(0)

    tokens = []
    last_line = 1
    last_col = 0

    for (
            tok_type, tok_text, (sline, scol), (eline, ecol), line,
    ) in tokenize.generate_tokens(tokenize_target.readline):
        if sline > last_line:
            newtok = lines[last_line][last_col:]
            for lineno in range(last_line + 1, sline):
                newtok += lines[lineno]
            if scol > 0:
                newtok += lines[sline][:scol]

            # a multiline unimportant whitespace may contain escaped newlines
            while _escaped_nl_re.search(newtok):
                ws, nl, newtok = _re_partition(_escaped_nl_re, newtok)
                if ws:
                    tokens.append(Token(UNIMPORTANT_WS, ws))
                tokens.append(Token(ESCAPED_NL, nl))
            if newtok:
                tokens.append(Token(UNIMPORTANT_WS, newtok))

        elif scol > last_col:
            tokens.append(Token(UNIMPORTANT_WS, line[last_col:scol]))

        tok_name = tokenize.tok_name[tok_type]
        utf8_byte_offset = len(line[:scol].encode('UTF-8'))
        # when a string prefix is not recognized, the tokenizer produces a
        # NAME token followed by a STRING token
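        # e.g. under a python 2 tokenizer, f'foo' arrives as NAME('f')
        # followed by STRING("'foo'") and is re-joined here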
        if (
                tok_name == 'STRING' and
                tokens and
                tokens[-1].name == 'NAME' and
                frozenset(tokens[-1].src.lower()) <= _string_prefixes
        ):
            newsrc = tokens[-1].src + tok_text
            tokens[-1] = tokens[-1]._replace(src=newsrc, name=tok_name)
        # produce octal literals as a single token in python 3 as well
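        # e.g. '0755' arrives as NUMBER('0') + NUMBER('755') under python 3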
        elif (
                tok_name == 'NUMBER' and
                tokens and
                tokens[-1].name == 'NUMBER'
        ):  # pragma: no cover (PY3)
            tokens[-1] = tokens[-1]._replace(src=tokens[-1].src + tok_text)
        # produce long literals as a single token in python 3 as well
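        # e.g. '123L' arrives as NUMBER('123') + NAME('L') under python 3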
        elif (
                tok_name == 'NAME' and
                tok_text.lower() == 'l' and
                tokens and
                tokens[-1].name == 'NUMBER'
        ):  # pragma: no cover (PY3)
            tokens[-1] = tokens[-1]._replace(src=tokens[-1].src + tok_text)
        else:
            tokens.append(Token(tok_name, tok_text, sline, utf8_byte_offset))
        last_line, last_col = eline, ecol

    return tokens


def tokens_to_src(tokens):  # type: (Iterable[Token]) -> str
    return ''.join(tok.src for tok in tokens)


def reversed_enumerate(tokens):
    # type: (Sequence[Token]) -> Generator[Tuple[int, Token], None, None]
    for i in reversed(range(len(tokens))):
        yield i, tokens[i]


def parse_string_literal(src):  # type: (str) -> Tuple[str, str]
    """parse a string literal's source into (prefix, string)"""
    match = _string_re.match(src)
    assert match is not None
    return match.group(1), match.group(2)
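# for illustration: parse_string_literal("f'foo'") == ('f', "'foo'")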


def rfind_string_parts(tokens, i):
    # type: (Sequence[Token], int) -> Tuple[int, ...]
    """find the indices of the string parts of a (joined) string literal

    - `i` should start at the end of the string literal
    - returns `()` (an empty tuple) for things which are not string literals
    """
    ret = []
    depth = 0
    for i in range(i, -1, -1):
        token = tokens[i]
        if token.name == 'STRING':
            ret.append(i)
        elif token.name in NON_CODING_TOKENS:
            pass
        elif token.src == ')':
            depth += 1
        elif depth and token.src == '(':
            depth -= 1
            # if we closed the paren(s) make sure it was a parenthesized string
            # and not actually a call
            if depth == 0:
                for j in range(i - 1, -1, -1):
                    tok = tokens[j]
                    if tok.name in NON_CODING_TOKENS:
                        pass
                    # this was actually a call and not a parenthesized string
                    elif (
                            tok.src in {']', ')'} or (
                                tok.name == 'NAME' and
                                tok.src not in keyword.kwlist
                            )
                    ):
                        return ()
                    else:
                        break
                break
        elif depth:  # it looked like a string but wasn't
            return ()
        else:
            break
    return tuple(reversed(ret))
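# for illustration: given tokens = src_to_tokens("'a' 'b'\n"), the implicitly
# joined literal has STRING parts at indices 0 and 2, so
#     rfind_string_parts(tokens, 2) == (0, 2)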


def main(argv=None):  # type: (Optional[Sequence[str]]) -> int
    parser = argparse.ArgumentParser()
    parser.add_argument('filename')
    args = parser.parse_args(argv)
    with io.open(args.filename) as f:
        tokens = src_to_tokens(f.read())

    def no_u_repr(s):  # type: (str) -> str
        return repr(s).lstrip('u')

    for token in tokens:
        if token.name == UNIMPORTANT_WS:
            line, col = '?', '?'
        else:
            line, col = token.line, token.utf8_byte_offset
        print(
            '{}:{} {} {}'.format(
                line, col, token.name, no_u_repr(token.src),
            ),
        )

    return 0


if __name__ == '__main__':
    exit(main())
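As a quick orientation (a sketch, not part of the vendored file): the module's
core guarantee is that src_to_tokens / tokens_to_src round-trip source exactly,
which is what makes in-place token rewriting safe. The import path below
assumes the lib/ layout this commit uses.

    # demo of assumed usage; 'lib.tokenize_rt' reflects where this commit
    # places the file
    from lib.tokenize_rt import UNIMPORTANT_WS, src_to_tokens, tokens_to_src

    src = 'x = 1  # comment\n'
    tokens = src_to_tokens(src)
    assert tokens_to_src(tokens) == src  # lossless round trip

    # rewrite: collapse each run of unimportant whitespace to a single space
    rewritten = [
        tok._replace(src=' ') if tok.name == UNIMPORTANT_WS else tok
        for tok in tokens
    ]
    print(tokens_to_src(rewritten))  # -> x = 1 # comment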